diff --git a/cmd/gguf-parser/main.go b/cmd/gguf-parser/main.go
index 6de6ae9..5c1f294 100644
--- a/cmd/gguf-parser/main.go
+++ b/cmd/gguf-parser/main.go
@@ -268,22 +268,22 @@ func main() {
 
 	if !skipEstimate {
 		es := e.Summarize(!noMMap)
-		if ctxSize <= 0 {
-			if a.MaximumContextLength == 0 {
-				a = gf.Architecture()
-			}
-			ctxSize = int(a.MaximumContextLength)
-		}
 		tprintf(
 			"ESTIMATE",
-			[]string{"Context Size", "Mem. Arch", "Usage"},
+			[]string{"Arch", "Context Size", "Full Offload", "MMap Support", "Mem. Arch", "Usage"},
 			[]string{
-				sprintf(ctxSize),
+				sprintf(e.Architecture),
+				sprintf(e.ContextSize),
+				sprintf(e.FullOffload),
+				sprintf(!e.NoMMap),
 				"UMA",
 				sprintf(es.UMA),
 			},
 			[]string{
-				sprintf(ctxSize),
+				sprintf(e.Architecture),
+				sprintf(e.ContextSize),
+				sprintf(e.FullOffload),
+				sprintf(!e.NoMMap),
 				"NonUMA",
 				fmt.Sprintf("%s (RAM) + %s (VRAM)", es.NonUMA.RAM, es.NonUMA.VRAM),
 			})
@@ -330,7 +330,7 @@ func tprintf(title string, header []string, body ...[]string) {
 	tb.SetAlignment(tablewriter.ALIGN_CENTER)
 	tb.SetHeaderLine(true)
 	tb.SetRowLine(true)
-	tb.SetAutoMergeCellsByColumnIndex([]int{0, 1, 2, 3})
+	tb.SetAutoMergeCellsByColumnIndex([]int{0, 1, 2, 3, 4})
 	tb.Append(append([]string{title}, header...))
 	for i := range body {
 		tb.Append(append([]string{title}, body[i]...))
diff --git a/file_architecture.go b/file_architecture.go
index dd6d710..b1b23b2 100644
--- a/file_architecture.go
+++ b/file_architecture.go
@@ -79,8 +79,6 @@ type GGUFArchitectureMetadata struct {
 
 	/* Appendix */
 
-	// EmbeddingHeadCount is the number of heads in the embedding layer.
-	EmbeddingHeadCount uint64 `json:"embeddingHeadCount,omitempty"`
 	// EmbeddingKeyGQA is the number of key GQA in the embedding layer.
 	EmbeddingKeyGQA uint64 `json:"embeddingKeyGQA,omitempty"`
 	// EmbeddingValueGQA is the number of value GQA in the embedding layer.
@@ -261,10 +259,15 @@ func (gf *GGUFFile) Architecture() (ga GGUFArchitectureMetadata) {
 		ga.VocabularyLength = v.ValueArray().Len
 	}
 
-	if ga.AttentionHeadCount > 0 {
-		ga.EmbeddingHeadCount = ga.EmbeddingLength / ga.AttentionHeadCount
-		ga.EmbeddingKeyGQA = uint64(ga.AttentionKeyLength) * ga.AttentionHeadCountKV
-		ga.EmbeddingValueGQA = uint64(ga.AttentionValueLength) * ga.AttentionHeadCountKV
+	{
+		if ga.AttentionHeadCount > 0 {
+			ga.EmbeddingKeyGQA = uint64(ga.AttentionKeyLength) * ga.AttentionHeadCountKV
+			ga.EmbeddingValueGQA = uint64(ga.AttentionValueLength) * ga.AttentionHeadCountKV
+		}
+		if ga.Architecture == "mamba" {
+			ga.EmbeddingKeyGQA = uint64((ga.SSMConvolutionKernel - 1) * ga.SSMInnerSize)
+			ga.EmbeddingValueGQA = uint64(ga.SSMStateSize * ga.SSMInnerSize)
+		}
 		ga.EmbeddingGQA = ga.EmbeddingValueGQA
 	}
 
diff --git a/file_estimate.go b/file_estimate.go
index 39262af..5d49e1b 100644
--- a/file_estimate.go
+++ b/file_estimate.go
@@ -11,12 +11,16 @@ import (
 type (
 	// LLaMACppUsageEstimate represents the estimated result of loading the GGUF file in llama.cpp.
 	LLaMACppUsageEstimate struct {
+		// Architecture describes what architecture this model implements.
+		Architecture string `json:"architecture"`
 		// FullOffload is the flag to indicate whether the layers are fully offloaded,
 		// false for partial offloaded or zero offloaded.
 		FullOffload bool `json:"fullOffload"`
 		// NoMMap is the flag to indicate whether the file must be loaded without mmap,
 		// true for total loaded.
 		NoMMap bool `json:"noMMap"`
+		// ContextSize is the size of the context.
+		ContextSize uint64 `json:"contextSize"`
 		// Load is the memory usage for running the GGUF file in RAM.
 		Load LLaMACppMemoryUsage `json:"load"`
 		// Offload is the memory usage for loading the GGUF file in VRAM.
@@ -72,52 +76,94 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) (
 	for _, opt := range opts {
 		opt(&o)
 	}
+	if o.CacheKeyType == nil {
+		o.CacheKeyType = ptr.To(GGMLTypeF16)
+	}
+	if o.CacheValueType == nil {
+		o.CacheValueType = ptr.To(GGMLTypeF16)
+	}
 
 	a, t := gf.Architecture(), gf.Tokenizer()
+	e.Architecture = a.Architecture
+
+	// Init hyperparameters,
+	// https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L6957-L7000.
+	var (
+		nContext  uint64
+		nTokens   uint64
+		nBatch    uint64
+		nOutputs  uint64
+		nParallel uint64
+		nKV       uint64
+	)
+	{
+		nContext = a.MaximumContextLength
+		if o.ContextSize != nil {
+			nContext = uint64(*o.ContextSize)
+		}
+		// Correct token size,
+		// see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L12221-L12224.
+		nTokens = min(nContext, uint64(ptr.Deref(o.BatchSize, 512)))
+		nBatch = nTokens
+		nOutputs = nTokens
+		nParallel = uint64(ptr.Deref(o.ParallelSize, 1))
+		nKV = nContext
+
+		// For mamba,
+		// see https://github.com/ggerganov/llama.cpp/blob/7672adeec7a79ea271058c63106c142ba84f951a/llama.cpp#L16122-L16129.
+		if a.Architecture == "mamba" {
+			nKV = nParallel
+			o.CacheKeyType = ptr.To(GGMLTypeF32)
+			o.CacheValueType = ptr.To(GGMLTypeF32)
+		}
-	nContext := a.MaximumContextLength
-	if o.ContextSize != nil {
-		nContext = uint64(*o.ContextSize)
+		e.ContextSize = nContext
 	}
 
+	// Full offload: isOffloadOutputLayer && nLoadLayers == 0.
+	// Partial offload: nLoadLayers > 0 && nOffloadLayers > 0.
+	// Zero offload: nOffloadLayers == 0.
 	var (
-		nLoadLayers    = a.BlockCount
-		nOffloadLayers uint64
-		nBatch         = min(nContext, uint64(ptr.Deref(o.BatchSize, 512)))
-		nParallel      = uint64(ptr.Deref(o.ParallelSize, 1))
+		nLoadLayers          = a.BlockCount
+		nOffloadLayers       uint64
+		isOffloadOutputLayer bool
 	)
 	{
 		if v := o.OffloadLayers; v == nil {
 			o.OffloadLayers = ptr.To(a.BlockCount)
-			nOffloadLayers = nLoadLayers
+			nOffloadLayers = a.BlockCount
 		} else if *v > 0 {
 			nOffloadLayers = *v
-			if nOffloadLayers > nLoadLayers {
-				nOffloadLayers = nLoadLayers
+			if nOffloadLayers >= a.BlockCount+1 {
+				isOffloadOutputLayer = true
+			}
+			if nOffloadLayers > a.BlockCount {
+				nOffloadLayers = a.BlockCount
 			}
 		}
 		nLoadLayers -= nOffloadLayers
+
+		e.FullOffload = isOffloadOutputLayer && nLoadLayers == 0
 	}
-	e.FullOffload = a.BlockCount == nOffloadLayers
 
 	// Footprint.
 	{
 		// Bootstrap.
-		e.Load.Footprint = GGUFBytesScalar(10 * 1024 * 1024)
-		e.Load.Footprint += gf.Size - gf.ModelSize
+		e.Load.Footprint = GGUFBytesScalar(5*1024*1024) /* model load */ + (gf.Size - gf.ModelSize) /* metadata */
 
-		// Tokens.
+		// Tokens,
+		// https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L6380-L6384.
 		fp := t.TokensLength * (4 /* token type */ + 4 /* token score*/)
 		if t.Model == "gpt2" {
 			fp += t.MergesLength * (48 /* key type */ + 56 /* value type */)
 		}
 		fp += t.TokensLength * (32 /* id to token vector */ + (24 + 32) /* token to id map*/)
+		e.Load.Footprint += GGUFBytesScalar(fp)
 
 		// Output buffer,
 		// see https://github.com/ggerganov/llama.cpp/blob/7672adeec7a79ea271058c63106c142ba84f951a/llama.cpp#L11940-L12003.
 		ob := 4 /* float32 size */ * (a.VocabularyLength + a.EmbeddingLength) * nParallel
-
-		e.Load.Footprint += GGUFBytesScalar(fp + ob)
+		e.Load.Footprint += GGUFBytesScalar(ob)
 	}
 
 	ls := gf.Layers()
@@ -146,8 +192,12 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) (
 	// IO,
 	// see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L4930-L5002.
 	e.Load.Weight.Input = GGUFBytesScalar(ipLs.Bytes())
-	e.Load.Weight.Output = GGUFBytesScalar(opLs.Bytes())
-	if nOffloadLayers == a.BlockCount {
+	if _, ok := opLs.Get("output.weight"); ok {
+		e.Load.Weight.Output = GGUFBytesScalar(opLs.Bytes())
+	} else {
+		e.Load.Weight.Output = GGUFBytesScalar(opLs.Bytes() + ioLs.Bytes() /* duplicate the input layer */)
+	}
+	if isOffloadOutputLayer && nLoadLayers == 0 { // Full offloaded.
 		// Transfer the output weight to VRAM when all layers are offloaded.
 		e.Offload.Weight.Output = e.Load.Weight.Output
 		e.Load.Weight.Output = 0
@@ -157,28 +207,8 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) (
 	// KV cache,
 	// see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L2479-L2501.
 	{
-		kt, vt := GGMLTypeF16, GGMLTypeF16
-		nKV := nContext
-		if o.CacheKeyType != nil {
-			kt = *o.CacheKeyType
-		}
-		if o.CacheValueType != nil {
-			vt = *o.CacheValueType
-		}
-		if a.Architecture == "mamba" {
-			// See https://github.com/ggerganov/llama.cpp/blob/7672adeec7a79ea271058c63106c142ba84f951a/llama.cpp#L16122-L16129.
-			kt, vt = GGMLTypeF32, GGMLTypeF32
-			nKV = nParallel
-		}
-
-		embedKeyGQA, embedValGQA := a.EmbeddingKeyGQA, a.EmbeddingValueGQA
-		if a.SSMConvolutionKernel > 0 {
-			embedKeyGQA += uint64(a.SSMConvolutionKernel - 1*a.SSMInnerSize)
-			embedValGQA += uint64(a.SSMStateSize * a.SSMInnerSize)
-		}
-
-		krs := kt.RowSizeOf([]uint64{embedKeyGQA * nKV})
-		vrs := vt.RowSizeOf([]uint64{embedValGQA * nKV})
+		krs := o.CacheKeyType.RowSizeOf([]uint64{a.EmbeddingKeyGQA * nKV})
+		vrs := o.CacheValueType.RowSizeOf([]uint64{a.EmbeddingValueGQA * nKV})
 
 		e.Load.KVCache.Key = GGUFBytesScalar(krs * nLoadLayers)
 		e.Load.KVCache.Value = GGUFBytesScalar(vrs * nLoadLayers)
@@ -188,64 +218,90 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) (
 
 	// Computation.
 	{
+		// Bootstrap, compute metadata,
+		// see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L16135-L16136.
+		cm := GGMLTensorOverhead()*GGMLComputationGraphNodesMaximum +
+			GGMLComputationGraphOverhead(GGMLComputationGraphNodesMaximum, false)
+		e.Load.Computation.Footprint = GGUFBytesScalar(cm)
+
+		// Scheduler overhead,
+		// see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L16149.
+		e.Load.Computation.Footprint += GGUFBytesScalar(4 * 1024 * 1024)
+
 		// GGML context,
 		// see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L5015-L5036.
 		gc := 2 /* buffer count */ * GGMLTensorOverhead() * (uint64(len(gf.TensorInfos)) + 1 + a.BlockCount*3)
-
-		// Graph overhead.
-		oh := GGMLTensorOverhead()*GGMLComputationGraphNodesMaximum +
-			GGMLComputationGraphOverhead(GGMLComputationGraphNodesMaximum, false)
-
-		e.Load.Computation.Footprint = GGUFBytesScalar(gc + oh)
+		e.Load.Computation.Footprint += GGUFBytesScalar(gc)
 
 		// Tensor usage,
 		// see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L16149.
 		//
-		// Firstly, get the usage of input layer.
+		// First, get the usage of input layer,
+		// see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L2279-L2290.
 		var (
 			inpTokens = GGMLTypeI32.RowSizeOf([]uint64{nBatch})                    // I32 [n_batch]
 			inpEmbd   = GGMLTypeF32.RowSizeOf([]uint64{a.EmbeddingLength, nBatch}) // F32 [n_embd, n_batch]
-			inpPos    = GGMLTypeI32.RowSizeOf([]uint64{nContext})                  // I32 [n_tokens]
-			inpOutIds = GGMLTypeI32.RowSizeOf([]uint64{nContext})                  // I32 [n_output],
-			inpKQMask = GGMLTypeF32.RowSizeOf([]uint64{nContext, nBatch})          // F32 [n_kv, n_batch]
+			inpPos    = GGMLTypeI32.RowSizeOf([]uint64{nBatch})                    // I32 [n_batch]
+			inpOutIds = GGMLTypeI32.RowSizeOf([]uint64{nOutputs})                  // I32 [n_outputs],
+			inpKQMask = GGMLTypeF32.RowSizeOf([]uint64{nKV, nBatch})               // F32 [n_kv, n_batch]
+			inpSMask  = GGMLTypeF32.RowSizeOf([]uint64{1, nKV})                    // F32 [1, n_kv]
+			inpSSeq   = GGMLTypeI32.RowSizeOf([]uint64{nKV, nBatch})               // I32 [n_kv, n_batch]
 		)
-		e.Load.Computation.Input = GGUFBytesScalar(inpTokens + inpEmbd + inpPos + inpKQMask + inpOutIds)
-		e.Offload.Computation.Input = GGUFBytesScalar(inpEmbd + inpPos + inpKQMask + inpOutIds)
+		if a.Architecture == "mamba" {
+			e.Load.Computation.Input = GGUFBytesScalar(inpTokens + inpEmbd + inpSMask + inpSSeq + inpOutIds)
+			e.Offload.Computation.Input = GGUFBytesScalar(inpEmbd + inpSMask + inpSSeq + inpOutIds)
+		} else {
+			e.Load.Computation.Input = GGUFBytesScalar(inpTokens + inpEmbd + inpPos + inpKQMask + inpOutIds)
+			e.Offload.Computation.Input = GGUFBytesScalar(inpEmbd + inpPos + inpKQMask + inpOutIds)
+		}
 		// Since the steps between transformer layers are serial,
 		// the allocated memory can be reused for the next layer.
 		// So, we only consider the usage of the largest layer,
 		// which is the last layer by default.
-		{
+		if a.Architecture == "mamba" {
+			convInc := GGMLTypeF32.RowSizeOf([]uint64{a.EmbeddingKeyGQA, nKV}) // F32 [n_embd_key_gqa, n_kv] reshape
+			for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.(attn_norm|ssm_in|ssm_conv1d)\.weight`)) {
+				if !strings.HasSuffix(l.Name, ".ssm_conv1d.weight") {
+					rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens})
+					convInc += rs
+					continue
+				}
+				// https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L10379.
+				rs := GGMLTypeF32.RowSizeOf([]uint64{uint64(a.SSMInnerSize)*nTokens + uint64(a.SSMConvolutionKernel)*uint64(a.SSMInnerSize)*nKV})
+				convInc += rs
+			}
+			ssmInc := uint64(0)
+			for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.ssm_(dt\.weight|a)`)) {
+				if !strings.HasSuffix(l.Name, ".ssm_a") {
+					rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens})
+					ssmInc += rs
+					continue
+				}
+				// https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L10413.
+				rs := GGMLTypeF32.RowSizeOf([]uint64{uint64(a.SSMInnerSize)*nTokens + uint64(a.SSMStateSize)*uint64(a.SSMInnerSize)*nKV})
+				ssmInc += rs
+			}
+			e.Offload.Computation.Compute = GGUFBytesScalar(convInc + ssmInc)
+		} else {
 			kvcInc := uint64(e.Load.KVCache.Key + e.Offload.KVCache.Key)
 			for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.attn_(norm|q|qkv)\.weight`)) {
-				rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nBatch})
+				rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens})
 				kvcInc += rs
 				switch {
 				default:
 					continue
 				case strings.HasSuffix(l.Name, ".attn_q.weight"):
 				case strings.HasSuffix(l.Name, ".attn_qkv.weight"):
-					rs = GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[0], nBatch})
+					rs = GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[0], nTokens})
 				}
 				kvcInc += rs * 2 // for RoPE
 			}
 			ffnInc := uint64(0)
 			for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.(attn_norm|ffn_norm|ffn_gate|ffn_up)\.weight`)) {
-				rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nBatch})
+				rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens})
 				ffnInc += rs
 			}
 			e.Offload.Computation.Compute = GGUFBytesScalar(max(kvcInc, ffnInc))
-			switch {
-			case nLoadLayers == 0: // Zero offloaded.
-				e.Load.Computation.Compute = GGUFBytesScalar(max(kvcInc, ffnInc))
-			case nLoadLayers > 0 && nOffloadLayers > 0: // Partial offloaded.
-				ffnInc = 0
-				for _, l := range tfLs[nLoadLayers-1].Search(regexp.MustCompile(`.*\.\d+\.ffn_(norm|gate|up)\.weight`)) {
-					rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nBatch})
-					ffnInc += rs
-				}
-				e.Load.Computation.Compute = GGUFBytesScalar(max(kvcInc, ffnInc))
-			}
 			// Special case: we cannot use mmap for splitting expert weights in MoE.
 			if a.ExpertCount > 0 {
 				e.NoMMap = len(tfLs[0].Search(regexp.MustCompile(`.*\.\d+\.ffn_gate_exps\.weight`))) == 0
@@ -254,10 +310,17 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) (
 		// Finally, get the usage of output layer.
 		{
 			outInc := inpEmbd
+			if a.Architecture == "mamba" {
+				outInc += inpSMask + inpSSeq
+			}
 			if l, ok := opLs.Get("output.weight"); ok {
-				rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nBatch})
+				rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens})
+				outInc += rs
+			} else if l, ok := ipLs.Get("token_embd.weight"); ok {
+				rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens})
 				outInc += rs
 			}
+			outInc += uint64(e.Load.Weight.Output)
 			e.Offload.Computation.Output = GGUFBytesScalar(outInc)
 		}
 	}
@@ -281,9 +344,11 @@ type LLaMACppUsageEstimateSummery struct {
 func (e LLaMACppUsageEstimate) Summarize(mmap bool) (es LLaMACppUsageEstimateSummery) {
 	// UMA.
 	{
-		kv := e.Load.KVCache.Sum() + e.Offload.KVCache.Sum()
+		fp := e.Load.Footprint + e.Offload.Footprint
 		wg := e.Load.Weight.Sum() + e.Offload.Weight.Sum()
-		es.UMA = e.Load.Footprint + max(kv, e.Load.Computation.Sum()) + wg
+		kv := e.Load.KVCache.Sum() + e.Offload.KVCache.Sum()
+		cp := e.Load.Computation.Sum()
+		es.UMA = fp + wg + kv + cp
 		if !e.NoMMap && mmap {
 			es.UMA -= wg
 		}
@@ -291,7 +356,7 @@ func (e LLaMACppUsageEstimate) Summarize(mmap bool) (es LLaMACppUsageEstimateSum
 
 	// TODO(thxCode): complete more cases,
 	// and support optional parameters for the following constants.
-
+	//
 	// Footprint,
 	// see https://github.com/ggerganov/llama.cpp/blob/f578b86b2123d0f92afbaa98a031df4d4464e582/llama.cpp#L2454-L2486.
 	const (
@@ -303,12 +368,21 @@ func (e LLaMACppUsageEstimate) Summarize(mmap bool) (es LLaMACppUsageEstimateSum
 
 	// NonUMA.
 	{
+		// RAM.
+		fp := cudaFootprint + e.Load.Footprint
 		wg := e.Load.Weight.Sum()
-		es.NonUMA.RAM = cudaFootprint + e.Load.Footprint + e.Load.KVCache.Sum() + e.Load.Computation.Sum() + wg - e.Load.Computation.Compute
+		kv := e.Load.KVCache.Sum()
+		cp := e.Load.Computation.Sum()
+		es.NonUMA.RAM = fp + wg + kv + cp
 		if !e.NoMMap && (mmap || e.FullOffload) {
 			es.NonUMA.RAM -= wg
 		}
-		es.NonUMA.VRAM = e.Offload.Footprint + e.Offload.Weight.Sum() + e.Offload.KVCache.Sum() + e.Offload.Computation.Sum()
+		// VRAM.
+		fp = e.Offload.Footprint
+		wg = e.Offload.Weight.Sum()
+		kv = e.Offload.KVCache.Sum()
+		cp = e.Offload.Computation.Sum()
+		es.NonUMA.VRAM = fp + wg + kv + cp
 	}
 
 	return es
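
A few notes on the estimate arithmetic in this change, each with a small runnable Go sketch. First, the offload bookkeeping: requesting `BlockCount + 1` layers now marks the output layer as offloaded, and `FullOffload` requires both that flag and `nLoadLayers == 0`. A minimal sketch of the classification, assuming the caller passes an explicit layer count (the real code also handles the nil "offload everything" default):

```go
package main

import "fmt"

// classify mirrors the full/partial/zero-offload rules from the diff.
func classify(blockCount, requested uint64) string {
	nLoadLayers, nOffloadLayers := blockCount, requested
	isOffloadOutputLayer := requested >= blockCount+1
	if nOffloadLayers > blockCount {
		nOffloadLayers = blockCount
	}
	nLoadLayers -= nOffloadLayers
	switch {
	case isOffloadOutputLayer && nLoadLayers == 0:
		return "full offload"
	case nLoadLayers > 0 && nOffloadLayers > 0:
		return "partial offload"
	case nOffloadLayers == 0:
		return "zero offload"
	default:
		return "all blocks offloaded, output kept in RAM"
	}
}

func main() {
	fmt.Println(classify(32, 33)) // full offload
	fmt.Println(classify(32, 16)) // partial offload
	fmt.Println(classify(32, 32)) // all blocks offloaded, output kept in RAM
	fmt.Println(classify(32, 0))  // zero offload
}
```

The distinction matters for the IO weights: only a full offload moves `Weight.Output` from `Load` to `Offload`.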
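Second, the footprint's output buffer: `ob` is one float32 logit per vocabulary entry plus one float32 embedding element, per parallel sequence. A sketch with a hypothetical 32000-token vocabulary and 4096-wide embedding:

```go
package main

import "fmt"

func main() {
	// Hypothetical sizes for illustration only.
	const (
		vocabularyLength = 32000
		embeddingLength  = 4096
		nParallel        = 1
	)
	ob := 4 /* float32 size */ * (vocabularyLength + embeddingLength) * nParallel
	fmt.Printf("output buffer ≈ %d KiB\n", ob/1024) // 141 KiB
}
```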
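Third, the KV-cache sizing: after this change the per-layer key and value row sizes are `EmbeddingKeyGQA * nKV` and `EmbeddingValueGQA * nKV` in the configured cache type. The sketch below assumes made-up 7B-style hyperparameters and reduces `RowSizeOf` to a plain element-size multiply, which holds for unquantized cache types like F16:

```go
package main

import "fmt"

func main() {
	// Hypothetical llama-style model for illustration only.
	const (
		blockCount  = 32   // layers holding a KV cache
		embedKeyGQA = 1024 // AttentionKeyLength * AttentionHeadCountKV
		embedValGQA = 1024 // AttentionValueLength * AttentionHeadCountKV
		nKV         = 4096 // nContext for non-mamba architectures
		f16Size     = 2    // bytes per F16 element
	)
	krs := embedKeyGQA * nKV * f16Size // per-layer key bytes
	vrs := embedValGQA * nKV * f16Size // per-layer value bytes
	total := (krs + vrs) * blockCount
	fmt.Printf("KV cache ≈ %d MiB\n", total/(1024*1024)) // 512 MiB
}
```

The `nLoadLayers`/`nOffloadLayers` split in the diff then divides this total between `Load` and `Offload` by layer count.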
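Fourth, the mamba branch: the `file_architecture.go` hunk reuses `EmbeddingKeyGQA`/`EmbeddingValueGQA` to carry the recurrent state sizes, `(d_conv - 1) * d_inner` for the convolution state and `d_state * d_inner` for the SSM state, while the estimator pins the cache types to F32 and sets `nKV = nParallel` (one state per sequence slot, independent of context length). A sketch with hypothetical hyperparameters:

```go
package main

import "fmt"

func main() {
	// Hypothetical mamba hyperparameters for illustration only.
	const (
		dConv  = 4    // SSMConvolutionKernel
		dInner = 4096 // SSMInnerSize
		dState = 16   // SSMStateSize
	)
	embedKeyGQA := (dConv - 1) * dInner // conv state elements per layer
	embedValGQA := dState * dInner      // ssm state elements per layer
	// nKV is nParallel rather than nContext, and the cache is forced to F32.
	nKV, f32Size := 1, 4
	perLayer := (embedKeyGQA + embedValGQA) * nKV * f32Size
	fmt.Printf("recurrent state ≈ %d KiB per layer\n", perLayer/1024) // 304 KiB
}
```

Moving this into `Architecture()` also fixes the operator-precedence bug visible in the removed KV-cache block (`a.SSMConvolutionKernel - 1*a.SSMInnerSize`), which now reads `(ga.SSMConvolutionKernel - 1) * ga.SSMInnerSize`.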
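Finally, `Summarize`: both totals are now the plain sum of the same four buckets, footprint + weights + KV cache + computation, with the weights subtracted again when mmap keeps them pageable. A sketch of the composition with made-up byte counts, where `cudaFootprint` stands in for the fixed host-side overhead constant the real code adds to RAM:

```go
package main

import "fmt"

func main() {
	const mib = 1024 * 1024
	// Hypothetical bucket sizes for a partially offloaded model.
	var (
		loadFp, offloadFp = uint64(64 * mib), uint64(0 * mib)
		loadWg, offloadWg = uint64(1024 * mib), uint64(3072 * mib)
		loadKV, offloadKV = uint64(128 * mib), uint64(384 * mib)
		loadCp, offloadCp = uint64(80 * mib), uint64(300 * mib)
	)
	cudaFootprint := uint64(400 * mib) // stand-in constant
	mmap, noMMap, fullOffload := true, false, false

	// UMA: one pool, load and offload buckets merged.
	uma := (loadFp + offloadFp) + (loadWg + offloadWg) + (loadKV + offloadKV) + loadCp
	if !noMMap && mmap {
		uma -= loadWg + offloadWg
	}

	// NonUMA: RAM and VRAM composed separately.
	ram := cudaFootprint + loadFp + loadWg + loadKV + loadCp
	if !noMMap && (mmap || fullOffload) {
		ram -= loadWg
	}
	vram := offloadFp + offloadWg + offloadKV + offloadCp

	fmt.Printf("UMA %d MiB, NonUMA %d (RAM) + %d (VRAM) MiB\n",
		uma/mib, ram/mib, vram/mib)
}
```

Note the UMA path only counts `Load.Computation` (`cp := e.Load.Computation.Sum()`), on the reading that compute buffers are not duplicated when RAM and VRAM are one pool.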