From 0f7733dff230e282fa9e68660ddb5f0b26e1ab39 Mon Sep 17 00:00:00 2001 From: thxCode Date: Tue, 11 Jun 2024 13:47:23 +0800 Subject: [PATCH] refactor: estimate Signed-off-by: thxCode --- README.md | 12 +- cmd/gguf-parser/main.go | 18 ++- file_estimate.go | 299 ++++++++++++++++++++++------------------ file_estimate_option.go | 28 ++-- file_estimate_test.go | 34 ++--- 5 files changed, 212 insertions(+), 179 deletions(-) diff --git a/README.md b/README.md index 84013d1..f9a877b 100644 --- a/README.md +++ b/README.md @@ -107,20 +107,26 @@ spew.Dump(f.Tokenizer()) ``` -### Estimate usage +### Estimate usage in [llama.cpp](https://github.com/ggerganov/llama.cpp) ```go -spew.Dump(f.Estimate()) +spew.Dump(f.EstimateLLaMACppUsage()) ``` #### Estimate with larger prompt ```go -spew.Dump(f.Estimate(WithContextSize(4096) /* 4K */)) +spew.Dump(f.EstimateLLaMACppUsage(WithContextSize(4096) /* 4K */)) ``` +#### Estimate with specific offload layers + +```go +spew.Dump(f.EstimateLLaMACppUsage(WithOffloadLayers(10))) +``` + ## License MIT diff --git a/cmd/gguf-parser/main.go b/cmd/gguf-parser/main.go index 577a57c..0bfaefe 100644 --- a/cmd/gguf-parser/main.go +++ b/cmd/gguf-parser/main.go @@ -103,7 +103,7 @@ func main() { ropts = append(ropts, SkipTLSVerification()) } - eopts := []GGUFEstimateOption{ + eopts := []LLaMACppUsageEstimateOption{ WithCacheValueType(GGMLTypeF16), WithCacheKeyType(GGMLTypeF16), } @@ -168,7 +168,7 @@ func main() { m GGUFModelMetadata a GGUFArchitectureMetadata t GGUFTokenizerMetadata - e GGUFEstimate + e LLaMACppUsageEstimate ) if !skipModel { m = gf.Model() @@ -180,7 +180,7 @@ func main() { t = gf.Tokenizer() } if !skipEstimate { - e = gf.Estimate(eopts...) + e = gf.EstimateLLaMACppUsage(eopts...) } // Output @@ -197,7 +197,7 @@ func main() { o["tokenizer"] = t } if !skipEstimate { - es := e.Sum(!noMMap) + es := e.Summarize(!noMMap) o["estimate"] = es } @@ -267,7 +267,7 @@ func main() { } if !skipEstimate { - es := e.Sum(!noMMap) + es := e.Summarize(!noMMap) if ctxSize <= 0 { if a.MaximumContextLength == 0 { a = gf.Architecture() @@ -276,20 +276,18 @@ func main() { } tprintf( "ESTIMATE", - []string{"Mem. Arch", "MMap", "Context Size", "(CPU) RAM", "(GPU) VRAM"}, + []string{"Mem. Arch", "MMap", "Context Size", "Usage"}, []string{ "UMA", sprintf(!noMMap), sprintf(ctxSize), - sprintf(es.UMA.RAM), - sprintf(es.UMA.VRAM), + sprintf(es.UMA), }, []string{ "NonUMA", sprintf(!noMMap), sprintf(ctxSize), - sprintf(es.NonUMA.RAM), - sprintf(es.NonUMA.VRAM), + fmt.Sprintf("%s(RAM) + %s (VRAM)", es.NonUMA.RAM, es.NonUMA.VRAM), }) } } diff --git a/file_estimate.go b/file_estimate.go index 7a43727..9abb32d 100644 --- a/file_estimate.go +++ b/file_estimate.go @@ -7,57 +7,66 @@ import ( "github.com/thxcode/gguf-parser-go/util/ptr" ) -// GGUFEstimate represents the estimated result of the GGUF file. -type GGUFEstimate struct { - // Load is the memory usage of the load part. - Load GGUFMemoryUsage `json:"load"` - // Offload is the memory usage of the offload part. - Offload GGUFMemoryUsage `json:"offload"` -} - +// Types for LLaMACpp estimation. type ( - // GGUFMemoryUsage represents the memory usage of the GGUF file. - GGUFMemoryUsage struct { - // Weight is the memory usage of weight. - Weight GGUFWeightUsage `json:"weight"` - // KVCache is the usage of key-value cache. - KVCache GGUFKVCacheUsage `json:"kvCache"` - // Tokens is the memory usage of token. - Tokens GGUFBytesScalar `json:"tokens"` - // Compute is the memory usage of computation. 
- Compute GGUFComputeUsage `json:"compute"` + // LLaMACppUsageEstimate represents the estimated result of loading the GGUF file in llama.cpp. + LLaMACppUsageEstimate struct { + // Layers is the number of layers for loading the GGUF file. + Layers uint64 `json:"layers"` + // OffloadLayers is the number of layers to offload. + OffloadLayers uint64 `json:"offloadLayers"` + // RAM is the memory usage for loading the GGUF file in RAM. + RAM LLaMACppMemoryUsage `json:"ram"` + // VRAM is the memory usage for loading the GGUF file in VRAM. + VRAM LLaMACppMemoryUsage `json:"vram"` } - // GGUFWeightUsage represents the memory usage of model weight. - GGUFWeightUsage struct { - // Compute is the memory usage of computing. - Compute GGUFBytesScalar `json:"compute"` - // Input is the memory usage of input. + // LLaMACppMemoryUsage represents the memory usage for expanding the GGUF file in llama.cpp. + LLaMACppMemoryUsage struct { + // Footprint is the memory footprint for bootstrapping. + Footprint GGUFBytesScalar `json:"footprint"` + // Weight is the memory usage of loading weights. + Weight LLaMACppWeightUsage `json:"weight"` + // KVCache is the memory usage of caching previous KV. + KVCache LLaMACppKVCacheUsage `json:"kvCache"` + // Computation is the memory usage of computation. + Computation LLaMACppComputationUsage `json:"computation"` + } + + // LLaMACppWeightUsage represents the memory usage of loading weights in llama.cpp. + LLaMACppWeightUsage struct { + // Input is the memory usage for loading input tensors. Input GGUFBytesScalar `json:"input"` - // Output is the memory usage of output. + // Compute is the memory usage for loading compute tensors. + Compute GGUFBytesScalar `json:"compute"` + // Output is the memory usage for loading output tensors. Output GGUFBytesScalar `json:"output"` } - // GGUFKVCacheUsage represents the usage of kv-cache. - GGUFKVCacheUsage struct { - // Key is the memory usage of the cached key. + // LLaMACppKVCacheUsage represents the memory usage of caching previous KV in llama.cpp. + LLaMACppKVCacheUsage struct { + // Key is the memory usage for caching previous keys. Key GGUFBytesScalar `json:"key"` - // Value is the memory usage of the cached value. + // Value is the memory usage for caching previous values. Value GGUFBytesScalar `json:"value"` } - // GGUFComputeUsage represents the memory usage of computation. - GGUFComputeUsage struct { - // Graph is the memory usage of computation graph. - Graph GGUFBytesScalar `json:"graph"` - // Others is the trivial usage. - Others GGUFBytesScalar `json:"others"` + // LLaMACppComputationUsage represents the memory usage of computation in llama.cpp. + LLaMACppComputationUsage struct { + // Footprint is the memory footprint for computation. + Footprint GGUFBytesScalar `json:"footprint"` + // Input is the memory usage for input. + Input GGUFBytesScalar `json:"input"` + // Compute is the memory usage for computation. + Compute GGUFBytesScalar `json:"graph"` + // Output is the memory usage for output. + Output GGUFBytesScalar `json:"output"` } ) -// Estimate returns the inference usage estimated result of the GGUF file. -func (gf *GGUFFile) Estimate(opts ...GGUFEstimateOption) (ge GGUFEstimate) { - var o _GGUFEstimateOptions +// EstimateLLaMACppUsage returns the inference memory usage estimated result of the GGUF file. 
+func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) (e LLaMACppUsageEstimate) { + var o _LLaMACppUsageEstimateOptions for _, opt := range opts { opt(&o) } @@ -87,6 +96,28 @@ func (gf *GGUFFile) Estimate(opts ...GGUFEstimateOption) (ge GGUFEstimate) { } nLoadLayers -= nOffloadLayers } + e.Layers = a.BlockCount + e.OffloadLayers = nOffloadLayers + + // Footprint. + { + // Bootstrap. + e.RAM.Footprint = GGUFBytesScalar(5 * 1024 * 1024) + + // Tokens. + fp := uint64(t.TokensSize) + fp += t.TokensLength * (4 /* token type */ + 4 /* token score*/) + if t.Model == "gpt2" { + fp += uint64(t.MergesSize) + fp += t.MergesLength * (48 /* key type */ + 56 /* value type */) + } + + // Output buffer, + // see https://github.com/ggerganov/llama.cpp/blob/7672adeec7a79ea271058c63106c142ba84f951a/llama.cpp#L11940-L12003. + ob := 4 /* float32 size */ * (a.VocabularyLength + a.EmbeddingLength) * nParallel + + e.RAM.Footprint += GGUFBytesScalar(fp + ob) + } ls := gf.Layers() ioLs, tfLs, _ := ls.Cut([]string{ @@ -94,29 +125,28 @@ func (gf *GGUFFile) Estimate(opts ...GGUFEstimateOption) (ge GGUFEstimate) { "output.weight", "output_norm.weight", }) + ipLs, opLs, _ := ioLs.Cut([]string{ + "token_embd.weight", + }) - // Model weight. + // Weight. { // Compute. for i, offloadStart := uint64(0), uint64(len(tfLs))-nOffloadLayers; i < uint64(len(tfLs)); i++ { - switch { - case i < nLoadLayers: - ge.Load.Weight.Compute += GGUFBytesScalar(tfLs[i].Bytes()) - case i >= offloadStart: - ge.Offload.Weight.Compute += GGUFBytesScalar(tfLs[i].Bytes()) + e.RAM.Weight.Compute += GGUFBytesScalar(tfLs[i].Bytes()) + if i >= offloadStart { + e.VRAM.Weight.Compute += GGUFBytesScalar(tfLs[i].Bytes()) } } // IO, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L4930-L5002. - inpLs, outLs, _ := ioLs.Cut([]string{ - "token_embd.weight", - }) - ge.Load.Weight.Input = GGUFBytesScalar(inpLs.Bytes()) - ge.Load.Weight.Output = GGUFBytesScalar(outLs.Bytes()) + e.RAM.Weight.Input = GGUFBytesScalar(ipLs.Bytes()) + e.RAM.Weight.Output = GGUFBytesScalar(opLs.Bytes()) if nOffloadLayers == a.BlockCount { - ge.Offload.Weight.Output = ge.Load.Weight.Output - ge.Load.Weight.Output = 0 + // Transfer the output weight to VRAM when all layers are offloaded. + e.VRAM.Weight.Output = e.RAM.Weight.Output + e.RAM.Weight.Output = 0 } } @@ -149,47 +179,28 @@ func (gf *GGUFFile) Estimate(opts ...GGUFEstimateOption) (ge GGUFEstimate) { krs := kt.RowSizeOf([]uint64{embedKeyGQA * nKV}) vrs := vt.RowSizeOf([]uint64{embedValGQA * nKV}) - ge.Load.KVCache.Key = GGUFBytesScalar(krs * nLoadLayers) - ge.Load.KVCache.Value = GGUFBytesScalar(vrs * nLoadLayers) - ge.Offload.KVCache.Key = GGUFBytesScalar(krs * nOffloadLayers) - ge.Offload.KVCache.Value = GGUFBytesScalar(vrs * nOffloadLayers) + e.RAM.KVCache.Key = GGUFBytesScalar(krs * nLoadLayers) + e.RAM.KVCache.Value = GGUFBytesScalar(vrs * nLoadLayers) + e.VRAM.KVCache.Key = GGUFBytesScalar(krs * nOffloadLayers) + e.VRAM.KVCache.Value = GGUFBytesScalar(vrs * nOffloadLayers) } - // Tokens. - ge.Load.Tokens += GGUFBytesScalar(t.TokensSize) - ge.Load.Tokens += GGUFBytesScalar(t.TokensLength * (4 /* token type */ + 4 /* token score*/)) - if t.Model == "gpt2" { - ge.Load.Tokens += GGUFBytesScalar(t.MergesSize) - ge.Load.Tokens += GGUFBytesScalar(t.MergesLength * (48 /* key type */ + 56 /* value type */)) - } - - // Compute. + // Computation. { - // Bootstrap. 
- ge.Load.Compute.Others += GGUFBytesScalar(15 * 1024 * 1024) - // GGML context, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L5015-L5036. - ggmlCtx := 2 /* buffer count */ * GGMLTensorOverhead() * (uint64(len(gf.TensorInfos)) + 1 + a.BlockCount*3) - ge.Load.Compute.Others += GGUFBytesScalar(ggmlCtx) - - // Output buffer, - // see https://github.com/ggerganov/llama.cpp/blob/7672adeec7a79ea271058c63106c142ba84f951a/llama.cpp#L11940-L12003. - outBuffer := 4 /* float32 size */ * (a.VocabularyLength + a.EmbeddingLength) * nParallel - ge.Load.Compute.Others += GGUFBytesScalar(outBuffer) + gc := 2 /* buffer count */ * GGMLTensorOverhead() * (uint64(len(gf.TensorInfos)) + 1 + a.BlockCount*3) // Graph overhead. - graphOverhead := GGMLTensorOverhead()*GGMLComputationGraphNodesMaximum + + oh := GGMLTensorOverhead()*GGMLComputationGraphNodesMaximum + GGMLComputationGraphOverhead(GGMLComputationGraphNodesMaximum, false) - ge.Load.Compute.Others += GGUFBytesScalar(graphOverhead) - } - // Computation graph. - { + e.RAM.Computation.Footprint = GGUFBytesScalar(gc + oh) + // Tensor usage, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L16149. // - // Firstly, get the usage of input tensors. + // Firstly, get the usage of input layer. var ( inpTokens = GGMLTypeI32.RowSizeOf([]uint64{nBatch}) // I32 [n_batch] inpEmbd = GGMLTypeF32.RowSizeOf([]uint64{a.EmbeddingLength, nBatch}) // F32 [n_embd, n_batch] @@ -197,88 +208,106 @@ func (gf *GGUFFile) Estimate(opts ...GGUFEstimateOption) (ge GGUFEstimate) { inpOutIds = GGMLTypeI32.RowSizeOf([]uint64{nContext}) // I32 [n_output], inpKQMask = GGMLTypeF32.RowSizeOf([]uint64{nContext, nBatch}) // F32 [n_kv, n_batch] ) - ge.Load.Compute.Graph += GGUFBytesScalar(inpTokens + inpEmbd + inpPos + inpKQMask + inpOutIds) - if nOffloadLayers > 0 { - ge.Offload.Compute.Graph += GGUFBytesScalar(inpEmbd + inpPos + inpKQMask + inpOutIds) - } + e.RAM.Computation.Input = GGUFBytesScalar(inpTokens + inpEmbd + inpPos + inpKQMask + inpOutIds) + e.VRAM.Computation.Input = GGUFBytesScalar(inpEmbd + inpPos + inpKQMask + inpOutIds) // Since the steps between transformer layers are serial, // the allocated memory can be reused for the next layer. // So, we only consider the usage of the largest layer, // which is the last layer by default. 
- kvcInc := uint64(ge.Load.KVCache.Key + ge.Offload.KVCache.Key) - for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.attn_(norm|q)\.weight`)) { - rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nBatch}) - kvcInc += rs - if strings.HasSuffix(l.Name, ".attn_q.weight") { - kvcInc += rs // for RoPE + { + kvcInc := uint64(e.RAM.KVCache.Key + e.VRAM.KVCache.Key) + for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.attn_(norm|q)\.weight`)) { + rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nBatch}) + kvcInc += rs + if strings.HasSuffix(l.Name, ".attn_q.weight") { + kvcInc += rs // for RoPE + } } - } - var ffnInc uint64 - for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.(attn_norm|ffn_norm|ffn_gate|ffn_up)\.weight`)) { - rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nBatch}) - ffnInc += rs - } - if nLoadLayers == a.BlockCount { - ge.Load.Compute.Graph += GGUFBytesScalar(max(kvcInc, ffnInc)) - } else { - ge.Offload.Compute.Graph += GGUFBytesScalar(max(kvcInc, ffnInc)) + ffnInc := uint64(0) + for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.(attn_norm|ffn_norm|ffn_gate|ffn_up)\.weight`)) { + rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nBatch}) + ffnInc += rs + } + e.VRAM.Computation.Compute = GGUFBytesScalar(max(kvcInc, ffnInc)) if nLoadLayers > 0 { ffnInc = 0 for _, l := range tfLs[nLoadLayers-1].Search(regexp.MustCompile(`.*\.\d+\.ffn_(norm|gate|up)\.weight`)) { rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nBatch}) ffnInc += rs } - ge.Load.Compute.Graph += GGUFBytesScalar(max(kvcInc, ffnInc)) + e.RAM.Computation.Compute = GGUFBytesScalar(max(kvcInc, ffnInc)) + } + } + // Finally, get the usage of output layer. + { + outInc := inpEmbd + if l, ok := opLs.Get("output.weight"); ok { + rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nBatch}) + outInc += rs } + e.VRAM.Computation.Output = GGUFBytesScalar(outInc) } } - return ge + return e } -type ( - GGUFEstimateSum struct { - // UMA is the usage of unified memory architecture. - UMA GGUFEstimateSumItem `json:"uma"` - // NonUMA is the usage of non-unified memory architecture. - NonUMA GGUFEstimateSumItem `json:"nonUMA"` - } - GGUFEstimateSumItem struct { - // RAM is the memory usage of the RAM. +// LLaMACppUsageEstimateSummery represents the summary of the usage for loading the GGUF file in llama.cpp. +type LLaMACppUsageEstimateSummery struct { + // UMA represents the usage of Unified Memory Architecture. + UMA GGUFBytesScalar `json:"uma"` + // NonUMA represents the usage of Non-Unified Memory Architecture. + NonUMA struct { + // RAM is the memory usage for loading the GGUF file in RAM. RAM GGUFBytesScalar `json:"ram"` - // VRAM is the memory usage of the VRAM. + // VRAM is the memory usage for loading the GGUF file in VRAM. 
VRAM GGUFBytesScalar `json:"vram"` - } -) + } `json:"nonUMA"` +} -func (e GGUFEstimate) Sum(mmap bool) (gs GGUFEstimateSum) { - gs.UMA = GGUFEstimateSumItem{ - RAM: e.Load.KVCache.Sum() + e.Offload.KVCache.Sum() + e.Load.Tokens + e.Load.Compute.Others, - VRAM: e.Offload.Compute.Sum(), - } - if !mmap { - gs.UMA.RAM += e.Load.Weight.Sum() - gs.UMA.VRAM += e.Offload.Weight.Sum() - } - gs.NonUMA = GGUFEstimateSumItem{ - RAM: e.Load.KVCache.Sum() + e.Load.Tokens + e.Load.Compute.Sum(), - VRAM: e.Offload.KVCache.Sum() + e.Offload.Compute.Sum(), +func (e LLaMACppUsageEstimate) Summarize(mmap bool) (es LLaMACppUsageEstimateSummery) { + // UMA. + { + es.UMA = e.RAM.Footprint + switch kv := e.RAM.KVCache.Sum() + e.VRAM.KVCache.Sum(); { + case e.OffloadLayers == 0: + cp := e.RAM.Computation.Sum() + es.UMA += max(kv, cp) + case e.Layers == e.OffloadLayers: + cp := e.VRAM.Computation.Sum() + es.UMA += max(kv, cp) + default: + es.UMA += max(kv, max(e.RAM.Computation.Sum(), e.VRAM.Computation.Sum())) + } + if !mmap { + es.UMA += e.RAM.Weight.Sum() + } } - if !mmap { - gs.NonUMA.RAM += e.Load.Weight.Sum() - gs.NonUMA.VRAM += e.Offload.Weight.Sum() + + // NonUMA. + { + es.NonUMA.RAM = e.RAM.Footprint + e.RAM.KVCache.Sum() + e.RAM.Computation.Sum() + if !mmap && e.Layers != e.OffloadLayers { + es.NonUMA.RAM += e.RAM.Weight.Sum() + } + es.NonUMA.VRAM = e.VRAM.Footprint + e.VRAM.Weight.Sum() + e.VRAM.KVCache.Sum() + e.VRAM.Computation.Sum() } - return gs + + return es } -func (w GGUFWeightUsage) Sum() GGUFBytesScalar { - return w.Compute + w.Input + w.Output +func (u LLaMACppWeightUsage) Sum() GGUFBytesScalar { + return u.Input + u.Compute + u.Output } -func (c GGUFKVCacheUsage) Sum() GGUFBytesScalar { - return c.Key + c.Value +func (u LLaMACppKVCacheUsage) Sum() GGUFBytesScalar { + return u.Key + u.Value } -func (c GGUFComputeUsage) Sum() GGUFBytesScalar { - return c.Graph + c.Others +func (u LLaMACppComputationUsage) Sum() GGUFBytesScalar { + r := u.Input + u.Compute + if r < u.Output { + r = u.Output + } + return u.Footprint + r } diff --git a/file_estimate_option.go b/file_estimate_option.go index 5fb93ef..8fdfdbc 100644 --- a/file_estimate_option.go +++ b/file_estimate_option.go @@ -5,7 +5,7 @@ import ( ) type ( - _GGUFEstimateOptions struct { + _LLaMACppUsageEstimateOptions struct { ContextSize *int32 ParallelSize *int32 BatchSize *int32 @@ -13,12 +13,12 @@ type ( CacheValueType *GGMLType OffloadLayers *uint64 } - GGUFEstimateOption func(*_GGUFEstimateOptions) + LLaMACppUsageEstimateOption func(*_LLaMACppUsageEstimateOptions) ) // WithContextSize sets the context size for the estimate. -func WithContextSize(size int32) GGUFEstimateOption { - return func(o *_GGUFEstimateOptions) { +func WithContextSize(size int32) LLaMACppUsageEstimateOption { + return func(o *_LLaMACppUsageEstimateOptions) { if size <= 0 { return } @@ -27,8 +27,8 @@ func WithContextSize(size int32) GGUFEstimateOption { } // WithParallelSize sets the (decoding sequences) parallel size for the estimate. -func WithParallelSize(size int32) GGUFEstimateOption { - return func(o *_GGUFEstimateOptions) { +func WithParallelSize(size int32) LLaMACppUsageEstimateOption { + return func(o *_LLaMACppUsageEstimateOptions) { if size <= 0 { return } @@ -37,8 +37,8 @@ func WithParallelSize(size int32) GGUFEstimateOption { } // WithBatchSize sets the physical batch size for the estimate. 
-func WithBatchSize(size int32) GGUFEstimateOption {
-	return func(o *_GGUFEstimateOptions) {
+func WithBatchSize(size int32) LLaMACppUsageEstimateOption {
+	return func(o *_LLaMACppUsageEstimateOptions) {
 		if size <= 0 {
 			return
 		}
@@ -57,8 +57,8 @@ var _GGUFEstimateCacheTypeAllowList = []GGMLType{
 }
 
 // WithCacheKeyType sets the cache key type for the estimate.
-func WithCacheKeyType(t GGMLType) GGUFEstimateOption {
-	return func(o *_GGUFEstimateOptions) {
+func WithCacheKeyType(t GGMLType) LLaMACppUsageEstimateOption {
+	return func(o *_LLaMACppUsageEstimateOptions) {
 		if slices.Contains(_GGUFEstimateCacheTypeAllowList, t) {
 			o.CacheKeyType = &t
 		}
@@ -66,8 +66,8 @@ func WithCacheKeyType(t GGMLType) GGUFEstimateOption {
 }
 
 // WithCacheValueType sets the cache value type for the estimate.
-func WithCacheValueType(t GGMLType) GGUFEstimateOption {
-	return func(o *_GGUFEstimateOptions) {
+func WithCacheValueType(t GGMLType) LLaMACppUsageEstimateOption {
+	return func(o *_LLaMACppUsageEstimateOptions) {
 		if slices.Contains(_GGUFEstimateCacheTypeAllowList, t) {
 			o.CacheValueType = &t
 		}
@@ -75,8 +75,8 @@ func WithCacheValueType(t GGMLType) GGUFEstimateOption {
 }
 
 // WithOffloadLayers sets the number of layers to offload.
-func WithOffloadLayers(layers uint64) GGUFEstimateOption {
-	return func(o *_GGUFEstimateOptions) {
+func WithOffloadLayers(layers uint64) LLaMACppUsageEstimateOption {
+	return func(o *_LLaMACppUsageEstimateOptions) {
 		o.OffloadLayers = &layers
 	}
 }
diff --git a/file_estimate_test.go b/file_estimate_test.go
index 4d98e5e..d41d903 100644
--- a/file_estimate_test.go
+++ b/file_estimate_test.go
@@ -7,7 +7,7 @@ import (
 	"github.com/davecgh/go-spew/spew"
 )
 
-func TestGGUFFile_Estimate(t *testing.T) {
+func TestGGUFFile_EstimateLLaMACppUsage(t *testing.T) {
 	ctx := context.Background()
 
 	cases := []struct {
@@ -60,12 +60,12 @@ func TestGGUFFile_Estimate(t *testing.T) {
 	for _, tc := range cases {
 		t.Run(tc.name, func(t *testing.T) {
 			f := tc.given
-			t.Log("\n", spew.Sdump(f.Estimate()), "\n")
+			t.Log("\n", spew.Sdump(f.EstimateLLaMACppUsage()), "\n")
 		})
 	}
 }
 
-func TestGGUFFile_Estimate_ContextSize(t *testing.T) {
+func TestGGUFFile_EstimateLLaMACppUsage_ContextSize(t *testing.T) {
 	ctx := context.Background()
 
 	f, err := ParseGGUFFileFromHuggingFace(
@@ -80,21 +80,21 @@ func TestGGUFFile_Estimate_ContextSize(t *testing.T) {
 	cases := []struct {
 		name string
-		opts []GGUFEstimateOption
+		opts []LLaMACppUsageEstimateOption
 	}{
-		{"1024(fp16)", []GGUFEstimateOption{WithContextSize(1024)}},
-		{"1024(fp32)", []GGUFEstimateOption{WithContextSize(1024), WithCacheKeyType(GGMLTypeF32), WithCacheValueType(GGMLTypeF32)}},
-		{"4096(fp16)", []GGUFEstimateOption{WithContextSize(4096)}},
-		{"4096(fp32)", []GGUFEstimateOption{WithContextSize(4096), WithCacheKeyType(GGMLTypeF32), WithCacheValueType(GGMLTypeF32)}},
+		{"1024(fp16)", []LLaMACppUsageEstimateOption{WithContextSize(1024)}},
+		{"1024(fp32)", []LLaMACppUsageEstimateOption{WithContextSize(1024), WithCacheKeyType(GGMLTypeF32), WithCacheValueType(GGMLTypeF32)}},
+		{"4096(fp16)", []LLaMACppUsageEstimateOption{WithContextSize(4096)}},
+		{"4096(fp32)", []LLaMACppUsageEstimateOption{WithContextSize(4096), WithCacheKeyType(GGMLTypeF32), WithCacheValueType(GGMLTypeF32)}},
 	}
 	for _, tc := range cases {
 		t.Run(tc.name, func(t *testing.T) {
-			t.Log("\n", spew.Sdump(f.Estimate(tc.opts...)), "\n")
+			t.Log("\n", spew.Sdump(f.EstimateLLaMACppUsage(tc.opts...)), "\n")
 		})
 	}
 }
 
-func TestGGUFFile_Estimate_OffloadLayers(t *testing.T) {
+func TestGGUFFile_EstimateLLaMACppUsage_OffloadLayers(t *testing.T) {
 	ctx := context.Background()
 
 	f, err := ParseGGUFFileFromHuggingFace(
@@ -109,17 +109,17 @@ func TestGGUFFile_Estimate_OffloadLayers(t *testing.T) {
 	cases := []struct {
 		name string
-		opts []GGUFEstimateOption
+		opts []LLaMACppUsageEstimateOption
 	}{
-		{"offload 0 layer", []GGUFEstimateOption{WithContextSize(512), WithOffloadLayers(0)}},
-		{"offload 1 layer", []GGUFEstimateOption{WithContextSize(512), WithOffloadLayers(1)}},
-		{"offload 10 layers", []GGUFEstimateOption{WithContextSize(512), WithOffloadLayers(10)}},
-		{"offload all layers", []GGUFEstimateOption{WithContextSize(512)}},
-		{"offload 33 layers", []GGUFEstimateOption{WithContextSize(512), WithOffloadLayers(33)}}, // exceeds the number of layers
+		{"offload 0 layer", []LLaMACppUsageEstimateOption{WithOffloadLayers(0)}},
+		{"offload 1 layer", []LLaMACppUsageEstimateOption{WithOffloadLayers(1)}},
+		{"offload 10 layers", []LLaMACppUsageEstimateOption{WithOffloadLayers(10)}},
+		{"offload all layers", []LLaMACppUsageEstimateOption{}},
+		{"offload 33 layers", []LLaMACppUsageEstimateOption{WithOffloadLayers(33)}}, // exceeds the number of layers
 	}
 	for _, tc := range cases {
 		t.Run(tc.name, func(t *testing.T) {
-			t.Log("\n", spew.Sdump(f.Estimate(tc.opts...)), "\n")
+			t.Log("\n", spew.Sdump(f.EstimateLLaMACppUsage(tc.opts...)), "\n")
 		})
 	}
 }
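
Not part of the diff above: a minimal usage sketch of the renamed API, mirroring what the README snippets and tests in this patch exercise. The package clause, Hugging Face repository, and model file name are placeholders; only `EstimateLLaMACppUsage`, the `With*` options, the per-part `Sum` helpers, and `Summarize` come from this patch.

```go
package gguf_parser // assumed: the same package as file_estimate_test.go

import (
	"context"
	"fmt"
)

// estimateLLaMACppUsageSketch shows the end-to-end flow introduced by this patch:
// parse a GGUF file, estimate llama.cpp memory usage, then summarize it.
func estimateLLaMACppUsageSketch() {
	ctx := context.Background()

	// Placeholder repository and file name; substitute any GGUF model.
	f, err := ParseGGUFFileFromHuggingFace(ctx, "<owner>/<repo>", "<model>.Q5_K_M.gguf")
	if err != nil {
		panic(err)
	}

	// Estimate with a 4K context, fp16 KV cache, and 10 offloaded layers.
	e := f.EstimateLLaMACppUsage(
		WithContextSize(4096),
		WithCacheKeyType(GGMLTypeF16),
		WithCacheValueType(GGMLTypeF16),
		WithOffloadLayers(10),
	)

	// Per-part breakdown from the new RAM/VRAM split.
	fmt.Println("offloaded layers:", e.OffloadLayers, "of", e.Layers)
	fmt.Println("VRAM weight:", e.VRAM.Weight.Sum(), "VRAM KV cache:", e.VRAM.KVCache.Sum())

	// Collapse into the totals the CLI prints, with mmap enabled.
	es := e.Summarize(true)
	fmt.Println("UMA:", es.UMA)
	fmt.Println("NonUMA:", es.NonUMA.RAM, "(RAM) +", es.NonUMA.VRAM, "(VRAM)")
}
```

Passing `false` to `Summarize` models the no-mmap path, where host-resident weights are counted against the RAM totals as well, matching the CLI's `noMMap` handling above.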