From b3d65a46dc92525325e17c248d6f315cbc793408 Mon Sep 17 00:00:00 2001 From: thxCode Date: Fri, 7 Jun 2024 19:08:17 +0800 Subject: [PATCH] refactor: simplify estimate Signed-off-by: thxCode --- README.md | 6 -- cmd/gguf-parser/main.go | 54 ++++------ file.go | 162 +++++++----------------------- file_architecture.go | 6 ++ file_architecture_test.go | 2 +- file_estimate.go | 143 +++++++++++++-------------- file_estimate_option.go | 33 ++++--- file_estimate_test.go | 35 +------ file_model_test.go | 2 +- file_tokenizer_test.go | 2 +- ggml.go | 203 ++++++++++++++++++++++++++++++++++++++ 11 files changed, 363 insertions(+), 285 deletions(-) create mode 100644 ggml.go diff --git a/README.md b/README.md index 4182db1..84013d1 100644 --- a/README.md +++ b/README.md @@ -121,12 +121,6 @@ spew.Dump(f.Estimate(WithContextSize(4096) /* 4K */)) ``` -#### Estimate with specific offload layers - -```go -spew.Dump(f.Estimate(WithOffloadLayers(10))) -``` - ## License MIT diff --git a/cmd/gguf-parser/main.go b/cmd/gguf-parser/main.go index 3afa85d..9373f4f 100644 --- a/cmd/gguf-parser/main.go +++ b/cmd/gguf-parser/main.go @@ -32,9 +32,8 @@ func main() { skipProxy bool skipTLS bool // estimate options - ctxSize = 512 - kvType = "f16" - offloadLayers uint64 + ctxSize = 512 + kvType = "f16" // output options version bool skipModel bool @@ -65,7 +64,6 @@ func main() { fs.BoolVar(&skipTLS, "skip-tls", skipTLS, "Skip TLS verification when reading from a remote URL") fs.IntVar(&ctxSize, "ctx-size", ctxSize, "Context size to estimate memory usage") fs.StringVar(&kvType, "kv-type", kvType, "Key-Value cache type, select from [f32, f16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1]") - fs.Uint64Var(&offloadLayers, "offload-layers", offloadLayers, "Specify how many layers to offload, default is fully offloading") fs.BoolVar(&version, "version", version, "Show version") fs.BoolVar(&skipModel, "skip-model", skipModel, "Skip model metadata") fs.BoolVar(&skipArchitecture, "skip-architecture", skipArchitecture, "Skip architecture metadata") @@ -129,9 +127,6 @@ func main() { } eopts = append(eopts, WithCacheKeyType(kv), WithCacheValueType(kv)) } - if offloadLayers > 0 { - eopts = append(eopts, WithOffloadLayers(offloadLayers)) - } // Parse GGUF file. 
@@ -206,23 +201,23 @@ func main() { if !skipModel { tprintf( "MODEL", - []string{"Name", "Architecture", "Quantization Version", "File Type", "Little Endian", "Size", "Parameters", "BPW"}, + []string{"Name", "Arch", "Quantization Version", "File Type", "Little Endian", "Size", "Parameters", "BPW"}, []string{ m.Name, m.Architecture, sprintf(m.QuantizationVersion), sprintf(m.FileType), sprintf(m.LittleEndian), - m.Size.String(), - m.Parameters.String(), - m.BitsPerWeight.String(), + sprintf(m.Size), + sprintf(m.Parameters), + sprintf(m.BitsPerWeight), }) } if !skipArchitecture { tprintf( "ARCHITECTURE", - []string{"Max Context Length", "Embedding Length", "Layers", "Feed Forward Length", "Expert Count", "Vocabulary Length"}, + []string{"Max Context Len", "Embedding Len", "Layers", "Feed Forward Len", "Expert Cnt", "Vocabulary Len"}, []string{ sprintf(a.MaximumContextLength), sprintf(a.EmbeddingLength), @@ -242,7 +237,7 @@ func main() { } tprintf( "TOKENIZER", - []string{"Model", "Tokens Length", "Added Tokens Length", "BOS Token", "EOS Token", "Unknown Token", "Separator Token", "Padding Token"}, + []string{"Model", "Tokens Len", "Added Tokens Len", "BOS Token", "EOS Token", "Unknown Token", "Separator Token", "Padding Token"}, []string{ t.Model, sprintf(t.TokensLength), @@ -256,30 +251,17 @@ func main() { } if !skipEstimate { - bs := [][]string{ - { - "TOTAL", - sprintf(ctxSize), - e.Total.KVCache.Sum().String(), - e.Total.Compute.String(), - e.Total.IO.String(), - e.Total.Sum().String(), - }, - } - if e.Offload != nil { - bs = append(bs, []string{ - "OFFLOAD", - sprintf(ctxSize), - e.Offload.KVCache.Sum().String(), - e.Offload.Compute.String(), - e.Offload.IO.String(), - e.Offload.Sum().String(), - }) - } tprintf( "ESTIMATE", - []string{"/", "Context Length", "KV Cache", "Compute Memory", "IO Memory", "Sum"}, - bs...) + []string{"Context Size", "Model Weight", "KV Cache", "Computation Graph Overhead", "Others", "Usage (w/o MMap)"}, + []string{ + sprintf(ctxSize), + sprintf(e.ModelWeight), + sprintf(e.KVCache.Sum()), + sprintf(e.ComputationGraphOverhead), + sprintf(e.Others), + sprintf(e.Sum()) + " (" + sprintf(e.Sum()+e.ModelWeight) + ")", + }) } } @@ -323,7 +305,7 @@ func tprintf(title string, header []string, body ...[]string) { tb.SetAlignment(tablewriter.ALIGN_CENTER) tb.SetHeaderLine(true) tb.SetRowLine(true) - tb.SetAutoMergeCells(true) + tb.SetAutoMergeCellsByColumnIndex([]int{0, 1, 2, 3}) tb.Append(append([]string{title}, header...)) for i := range body { tb.Append(append([]string{title}, body[i]...)) diff --git a/file.go b/file.go index c9a6ffb..1e3622d 100644 --- a/file.go +++ b/file.go @@ -165,59 +165,6 @@ type ( GGUFMetadataKVs []GGUFMetadataKV ) -// Types for GGMLType. -type ( - // GGMLType is a type of GGML tensor, - // see https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#file-structure. - GGMLType uint32 - - // GGMLTypeTrait holds the trait of a GGMLType, - // see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/src/ggml.c#L564-L918. - GGMLTypeTrait struct { - BlockSize uint64 // Original is int, in order to reduce conversion, here we use uint64. - TypeSize uint64 // Original is uint32, in order to reduce conversion, here we use uint64. - Quantized bool - } -) - -// GGMLType constants. -// -// GGMLTypeQ4_2, GGMLTypeQ4_3 are deprecated. 
-const ( - GGMLTypeF32 GGMLType = iota - GGMLTypeF16 - GGMLTypeQ4_0 - GGMLTypeQ4_1 - GGMLTypeQ4_2 - GGMLTypeQ4_3 - GGMLTypeQ5_0 - GGMLTypeQ5_1 - GGMLTypeQ8_0 - GGMLTypeQ8_1 - GGMLTypeQ2_K - GGMLTypeQ3_K - GGMLTypeQ4_K - GGMLTypeQ5_K - GGMLTypeQ6_K - GGMLTypeQ8_K - GGMLTypeIQ2_XXS - GGMLTypeIQ2_XS - GGMLTypeIQ3_XXS - GGMLTypeIQ1_S - GGMLTypeIQ4_NL - GGMLTypeIQ3_S - GGMLTypeIQ2_S - GGMLTypeIQ4_XS - GGMLTypeI8 - GGMLTypeI16 - GGMLTypeI32 - GGMLTypeI64 - GGMLTypeF64 - GGMLTypeIQ1_M - GGMLTypeBF16 - _GGMLTypeCount // Unknown -) - // Types for GGUFTensorInfo. type ( // GGUFTensorInfo represents a tensor info in a GGUF file. @@ -458,7 +405,8 @@ func parseGGUFFile(s int64, f io.ReadSeeker, o _GGUFReadOptions) (_ *GGUFFile, e // Types for GGUF hierarchical tensors. type ( - // IGGUFTensorInfos is an interface for GGUFTensorInfos. + // IGGUFTensorInfos is an interface for GGUF tensor infos, + // which includes basic operations. IGGUFTensorInfos interface { // Get returns the GGUFTensorInfo with the given name, // and true if found, and false otherwise. @@ -468,10 +416,12 @@ type ( // Index returns a map value to the GGUFTensorInfo with the given names, // and the number of names found. Index(names []string) (infos map[string]GGUFTensorInfo, found int) - // Elements returns the number of elements of the GGUFTensorInfo. + // Elements returns the number of elements(parameters). Elements() uint64 - // Bytes returns the number of bytes of the GGUFTensorInfo. + // Bytes returns the number of bytes. Bytes() uint64 + // Count returns the number of tensors. + Count() uint64 } // GGUFLayerTensorInfos represents hierarchical tensor infos of a GGUF file, @@ -496,7 +446,16 @@ type ( ) // Layers converts the GGUFTensorInfos to GGUFLayerTensorInfos. -func (gf *GGUFFile) Layers() GGUFLayerTensorInfos { +func (gf *GGUFFile) Layers(ignores ...string) GGUFLayerTensorInfos { + ls := gf.layers() + if len(ignores) != 0 { + _, ls, _ = ls.Cut(ignores) + return ls + } + return ls +} + +func (gf *GGUFFile) layers() GGUFLayerTensorInfos { var ret GGUFLayerTensorInfos pm := make(map[string]any) @@ -921,73 +880,6 @@ func (kvs GGUFMetadataKVs) Index(keys []string) (values map[string]GGUFMetadataK return values, found } -// _GGMLTypeTraits is a table of GGMLTypeTrait for GGMLType. 
-var _GGMLTypeTraits = map[GGMLType]GGMLTypeTrait{ - GGMLTypeF32: {BlockSize: 1, TypeSize: 4}, - GGMLTypeF16: {BlockSize: 1, TypeSize: 2}, - GGMLTypeQ4_0: {BlockSize: 32, TypeSize: 18, Quantized: true}, - GGMLTypeQ4_1: {BlockSize: 32, TypeSize: 20, Quantized: true}, - GGMLTypeQ4_2: {BlockSize: 0, TypeSize: 0}, // Deprecated - GGMLTypeQ4_3: {BlockSize: 0, TypeSize: 0}, // Deprecated - GGMLTypeQ5_0: {BlockSize: 32, TypeSize: 22, Quantized: true}, - GGMLTypeQ5_1: {BlockSize: 32, TypeSize: 24, Quantized: true}, - GGMLTypeQ8_0: {BlockSize: 32, TypeSize: 34, Quantized: true}, - GGMLTypeQ8_1: {BlockSize: 32, TypeSize: 36, Quantized: true}, - GGMLTypeQ2_K: {BlockSize: 256, TypeSize: 84, Quantized: true}, - GGMLTypeQ3_K: {BlockSize: 256, TypeSize: 110, Quantized: true}, - GGMLTypeQ4_K: {BlockSize: 256, TypeSize: 144, Quantized: true}, - GGMLTypeQ5_K: {BlockSize: 256, TypeSize: 176, Quantized: true}, - GGMLTypeQ6_K: {BlockSize: 256, TypeSize: 210, Quantized: true}, - GGMLTypeQ8_K: {BlockSize: 256, TypeSize: 292, Quantized: true}, - GGMLTypeIQ2_XXS: {BlockSize: 256, TypeSize: 66, Quantized: true}, - GGMLTypeIQ2_XS: {BlockSize: 256, TypeSize: 74, Quantized: true}, - GGMLTypeIQ3_XXS: {BlockSize: 256, TypeSize: 98, Quantized: true}, - GGMLTypeIQ1_S: {BlockSize: 256, TypeSize: 50, Quantized: true}, - GGMLTypeIQ4_NL: {BlockSize: 32, TypeSize: 18, Quantized: true}, - GGMLTypeIQ3_S: {BlockSize: 256, TypeSize: 110, Quantized: true}, - GGMLTypeIQ2_S: {BlockSize: 256, TypeSize: 82, Quantized: true}, - GGMLTypeIQ4_XS: {BlockSize: 256, TypeSize: 136, Quantized: true}, - GGMLTypeI8: {BlockSize: 1, TypeSize: 1}, - GGMLTypeI16: {BlockSize: 1, TypeSize: 2}, - GGMLTypeI32: {BlockSize: 1, TypeSize: 4}, - GGMLTypeI64: {BlockSize: 1, TypeSize: 8}, - GGMLTypeF64: {BlockSize: 1, TypeSize: 8}, - GGMLTypeIQ1_M: {BlockSize: 256, TypeSize: 56, Quantized: true}, - GGMLTypeBF16: {BlockSize: 1, TypeSize: 2}, -} - -// Trait returns the GGMLTypeTrait of the GGMLType. -func (t GGMLType) Trait() (GGMLTypeTrait, bool) { - tt, ok := _GGMLTypeTraits[t] - return tt, ok -} - -// RowSizeOf returns the size of the given dimensions according to the GGMLType's GGMLTypeTrait, -// which is inspired by -// https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/src/ggml.c#L3142-L3145. -// -// The index of the given dimensions means the number of dimension, -// i.e. 0 is the first dimension, 1 is the second dimension, and so on. -// -// The value of the item is the number of elements in the corresponding dimension. -func (t GGMLType) RowSizeOf(dimensions []uint64) uint64 { - if len(dimensions) == 0 { - panic(errors.New("no dimensions")) - } - - tt, ok := t.Trait() - if !ok { - panic(fmt.Errorf("invalid type: %v", t)) - } - - // https://github.com/ggerganov/ggml/blob/a10a8b880c059b3b29356eb9a9f8df72f03cdb6a/src/ggml.c#L2640-L2643 - ds := tt.TypeSize * dimensions[0] / tt.BlockSize // Row size - for i := 1; i < len(dimensions); i++ { - ds *= dimensions[i] - } - return ds -} - // Get returns the GGUFTensorInfo with the given name, // and true if found, and false otherwise. func (ti GGUFTensorInfo) Get(name string) (info GGUFTensorInfo, found bool) { @@ -1071,6 +963,12 @@ func (ti GGUFTensorInfo) Bytes() uint64 { return ret } +// Count returns the number of GGUF tensors of the GGUFTensorInfo, +// which is always 1. +func (ti GGUFTensorInfo) Count() uint64 { + return 1 +} + // Get returns the GGUFTensorInfo with the given name, // and true if found, and false otherwise. 
func (tis GGUFTensorInfos) Get(name string) (info GGUFTensorInfo, found bool) { @@ -1130,7 +1028,12 @@ func (tis GGUFTensorInfos) Bytes() uint64 { return ret } -// Get returns the GGUFTensorInfo with the given name, +// Count returns the number of GGUF tensors of the GGUFTensorInfos. +func (tis GGUFTensorInfos) Count() uint64 { + return uint64(len(tis)) +} + +// Get returns the IGGUFTensorInfos with the given name, // and true if found, and false otherwise. func (ltis GGUFLayerTensorInfos) Get(name string) (info GGUFTensorInfo, found bool) { for i := range ltis { @@ -1211,6 +1114,15 @@ func (ltis GGUFLayerTensorInfos) Bytes() uint64 { return ret } +// Count returns the number of GGUF tensors of the GGUFLayerTensorInfos. +func (ltis GGUFLayerTensorInfos) Count() uint64 { + var ret uint64 + for i := range ltis { + ret += ltis[i].Count() + } + return ret +} + // Cut splits the GGUFLayerTensorInfos into two parts, // and returns the GGUFLayerTensorInfos with the names that match the given names at first, // and the GGUFLayerTensorInfos without the names at second, diff --git a/file_architecture.go b/file_architecture.go index 8e1dd31..09c60b7 100644 --- a/file_architecture.go +++ b/file_architecture.go @@ -2,6 +2,10 @@ package gguf_parser // GGUFArchitectureMetadata represents the architecture metadata of a GGUF file. type GGUFArchitectureMetadata struct { + // Architecture describes what architecture this model implements. + // + // All lowercase ASCII, with only [a-z0-9]+ characters allowed. + Architecture string `json:"architecture"` // MaximumContextLength(n_ctx_train) is the maximum context length of the model. // // For most architectures, this is the hard limit on the length of the input. @@ -114,6 +118,8 @@ func (gf *GGUFFile) Architecture() (ga GGUFArchitectureMetadata) { tokenizerGGMLTokensKey = "tokenizer.ggml.tokens" ) + ga.Architecture = arch + m, _ := gf.Header.MetadataKV.Index([]string{ contextLengthKey, embeddingLengthKey, diff --git a/file_architecture_test.go b/file_architecture_test.go index 84fa433..b6b17a6 100644 --- a/file_architecture_test.go +++ b/file_architecture_test.go @@ -14,7 +14,7 @@ func TestGGUFFile_Architecture(t *testing.T) { f, err := ParseGGUFFileFromHuggingFace( ctx, "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", - "Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf", + "Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf", SkipLargeMetadata()) if err != nil { t.Fatal(err) diff --git a/file_estimate.go b/file_estimate.go index 366aa85..186f02b 100644 --- a/file_estimate.go +++ b/file_estimate.go @@ -1,22 +1,20 @@ package gguf_parser -// GGUFEstimate represents the estimated result of the GGUF file. -type GGUFEstimate struct { - // Offload is the offloaded layers usage. - Offload *GGUFMemoryUsage `json:"offload,omitempty"` - // Total is the total memory usage. - Total GGUFMemoryUsage `json:"total"` -} +import ( + "github.com/thxcode/gguf-parser-go/util/ptr" +) type ( - // GGUFMemoryUsage represents the memory usage of the GGUF file. - GGUFMemoryUsage struct { + // GGUFEstimate represents the estimated result of the GGUF file. + GGUFEstimate struct { + // ModelWeight is the memory usage of model weight. + ModelWeight GGUFBytesScalar `json:"modelWeight"` // KVCache is the usage of key-value cache. KVCache GGUFKVCacheUsage `json:"kvCache"` - // Compute is the usage of transformer layers. - Compute GGUFBytesScalar `json:"compute"` - // IO is the usage of input/output layers. - IO GGUFBytesScalar `json:"io"` + // ComputationGraphOverhead is the overhead of computation graph. 
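+	// It accounts for the graph bookkeeping structures and the input tensors
+	// (token, embedding, position, output ID and KQ mask buffers).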
+	ComputationGraphOverhead GGUFBytesScalar `json:"computationGraphOverhead"`
+	// Others is the trivial usage, e.g. runtime overhead, GGML context and output buffer.
+	Others GGUFBytesScalar `json:"others"`
 	}

 	// GGUFKVCacheUsage represents the usage of kv-cache.
@@ -35,89 +33,90 @@ func (gf *GGUFFile) Estimate(opts ...GGUFEstimateOption) (ge GGUFEstimate) {
 		opt(&o)
 	}

-	ge.Offload, ge.Total = gf.estimateMemoryUsage(gf.Architecture(), o)
-	return ge
-}
-
-func (m GGUFMemoryUsage) Sum() GGUFBytesScalar {
-	return m.Compute + m.KVCache.Sum() + m.IO
-}
+	a := gf.Architecture()

-func (c GGUFKVCacheUsage) Sum() GGUFBytesScalar {
-	return c.Key + c.Value
-}
-
-func (gf *GGUFFile) estimateMemoryUsage(a GGUFArchitectureMetadata, o _GGUFEstimateOptions) (offload *GGUFMemoryUsage, total GGUFMemoryUsage) {
-	if o.OffloadLayers != nil {
-		offload = &GGUFMemoryUsage{}
+	contextSize := a.MaximumContextLength
+	if o.ContextSize != nil {
+		contextSize = uint64(*o.ContextSize)
 	}

-	// KV cache.
-	// https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L2479-L2501
+	// Model weight.
+	ge.ModelWeight = gf.ModelSize
+
+	// KV cache,
+	// see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L2479-L2501.
 	{
 		kt, vt := GGMLTypeF16, GGMLTypeF16
-
+		kvSize := contextSize
 		if o.CacheKeyType != nil {
 			kt = *o.CacheKeyType
 		}
 		if o.CacheValueType != nil {
 			vt = *o.CacheValueType
 		}
+		if a.Architecture == "mamba" {
+			// See https://github.com/ggerganov/llama.cpp/blob/7672adeec7a79ea271058c63106c142ba84f951a/llama.cpp#L16122-L16129.
+			kt, vt = GGMLTypeF32, GGMLTypeF32
+			kvSize = uint64(ptr.Deref(o.ParallelSize, 1))
+		}

 		var (
 			embedKeyGQA = uint64(a.AttentionKeyLength) * a.AttentionHeadCountKV
 			embedValGQA = uint64(a.AttentionValueLength) * a.AttentionHeadCountKV
-			kvSize      = a.MaximumContextLength
 		)
-		{
-			// Correct.
-			if a.SSMConvolutionKernel > 0 {
-				embedKeyGQA += uint64(a.SSMConvolutionKernel - 1*a.SSMInnerSize)
-				embedValGQA += uint64(a.SSMStateSize * a.SSMInnerSize)
-			}
-			if o.ContextSize != nil {
-				kvSize = uint64(*o.ContextSize)
-			}
+		if a.SSMConvolutionKernel > 0 {
+			embedKeyGQA += uint64((a.SSMConvolutionKernel - 1) * a.SSMInnerSize) // (d_conv - 1) * d_inner
+			embedValGQA += uint64(a.SSMStateSize * a.SSMInnerSize)               // d_state * d_inner
 		}

 		krs := kt.RowSizeOf([]uint64{embedKeyGQA * kvSize})
 		vrs := vt.RowSizeOf([]uint64{embedValGQA * kvSize})

-		if offload != nil {
-			v := *o.OffloadLayers
-			if v > a.BlockCount {
-				v = a.BlockCount
-			}
-			offload.KVCache.Key = GGUFBytesScalar(krs * v)
-			offload.KVCache.Value = GGUFBytesScalar(vrs * v)
-		}
+		ge.KVCache.Key = GGUFBytesScalar(krs * a.BlockCount)
+		ge.KVCache.Value = GGUFBytesScalar(vrs * a.BlockCount)
+	}

-		total.KVCache.Key = GGUFBytesScalar(krs * a.BlockCount)
-		total.KVCache.Value = GGUFBytesScalar(vrs * a.BlockCount)
+	// Others.
+	{
+		// Runtime overhead.
+		ge.Others += GGUFBytesScalar(15 * 1024 * 1024) // NB(thxCode): Magic here.
+
+		// GGML context,
+		// see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L5015-L5036.
+		ggmlCtx := 2 /* buffer count */ * GGMLTensorOverhead() * (uint64(len(gf.TensorInfos)) + 1 + a.BlockCount*3)
+		ge.Others += GGUFBytesScalar(ggmlCtx)
+
+		// Output buffer,
+		// see https://github.com/ggerganov/llama.cpp/blob/7672adeec7a79ea271058c63106c142ba84f951a/llama.cpp#L11940-L12003.
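+		// The buffer holds float32 logits (n_vocab) and float32 embedding outputs
+		// (n_embd) for each parallel decoding sequence.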
+ outBuffer := 4 /* float32 size */ * (a.VocabularyLength + a.EmbeddingLength) * uint64(ptr.Deref(o.ParallelSize, 1)) + ge.Others += GGUFBytesScalar(outBuffer) } - ls := gf.Layers() - bls, als, _ := ls.Cut([]string{ - "token_embd.weight", - "output.weight", - "output_norm.weight", - }) - - // IO. - total.IO = GGUFBytesScalar(bls.Bytes()) - - // Compute. - if offload != nil { - v := *o.OffloadLayers - if v >= a.BlockCount { - offload.Compute = GGUFBytesScalar(als.Bytes()) - } else { - for i := uint64(len(als) - 1); i >= uint64(len(als))-v; i-- { - offload.Compute += GGUFBytesScalar(als[i].Bytes()) - } - } + // Computation graph. + { + graphOverhead := GGMLTensorOverhead()*GGMLComputationGraphNodesMaximum + + GGMLComputationGraphOverhead(GGMLComputationGraphNodesMaximum, false) + ge.ComputationGraphOverhead += GGUFBytesScalar(graphOverhead) + + var ( + nBatch = min(contextSize, uint64(ptr.Deref(o.BatchSize, 512))) + + inpTokens = GGMLTypeI32.RowSizeOf([]uint64{nBatch}) // I32 [n_batch] + inpEmbd = GGMLTypeF32.RowSizeOf([]uint64{a.EmbeddingLength, nBatch}) // F32 [n_embd, n_batch] + inpPos = GGMLTypeI32.RowSizeOf([]uint64{contextSize}) // I32 [n_tokens] + inpOutIds = GGMLTypeI32.RowSizeOf([]uint64{contextSize}) // I32 [n_output], + inpKQMask = GGMLTypeF32.RowSizeOf([]uint64{contextSize, nBatch}) // F32 [n_kv, n_batch] + ) + ge.ComputationGraphOverhead += GGUFBytesScalar(inpTokens + inpEmbd + inpPos + inpKQMask + inpOutIds) } - total.Compute = GGUFBytesScalar(als.Bytes()) - return offload, total + return ge +} + +func (e GGUFEstimate) Sum() GGUFBytesScalar { + return e.KVCache.Sum() + e.ComputationGraphOverhead + e.Others +} + +func (c GGUFKVCacheUsage) Sum() GGUFBytesScalar { + return c.Key + c.Value } diff --git a/file_estimate_option.go b/file_estimate_option.go index d63ef41..80f2b10 100644 --- a/file_estimate_option.go +++ b/file_estimate_option.go @@ -7,9 +7,10 @@ import ( type ( _GGUFEstimateOptions struct { ContextSize *int32 + ParallelSize *int32 + BatchSize *int32 CacheKeyType *GGMLType CacheValueType *GGMLType - OffloadLayers *uint64 } GGUFEstimateOption func(*_GGUFEstimateOptions) ) @@ -24,6 +25,26 @@ func WithContextSize(size int32) GGUFEstimateOption { } } +// WithParallelSize sets the (decoding sequences) parallel size for the estimate. +func WithParallelSize(size int32) GGUFEstimateOption { + return func(o *_GGUFEstimateOptions) { + if size <= 0 { + return + } + o.ParallelSize = &size + } +} + +// WithBatchSize sets the physical batch size for the estimate. +func WithBatchSize(size int32) GGUFEstimateOption { + return func(o *_GGUFEstimateOptions) { + if size <= 0 { + return + } + o.BatchSize = &size + } +} + // _GGUFEstimateCacheTypeAllowList is the allow list of cache key and value types. var _GGUFEstimateCacheTypeAllowList = []GGMLType{ GGMLTypeF32, @@ -51,13 +72,3 @@ func WithCacheValueType(t GGMLType) GGUFEstimateOption { } } } - -// WithOffloadLayers sets the number of layers to offload. 
-func WithOffloadLayers(layers uint64) GGUFEstimateOption { - return func(o *_GGUFEstimateOptions) { - if layers <= 0 { - return - } - o.OffloadLayers = &layers - } -} diff --git a/file_estimate_test.go b/file_estimate_test.go index d922174..5f7380f 100644 --- a/file_estimate_test.go +++ b/file_estimate_test.go @@ -20,7 +20,7 @@ func TestGGUFFile_Estimate(t *testing.T) { f, err := ParseGGUFFileFromHuggingFace( ctx, "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", - "Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf", + "Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf", SkipLargeMetadata()) if err != nil { t.Fatal(err) @@ -65,13 +65,13 @@ func TestGGUFFile_Estimate(t *testing.T) { } } -func TestGGUFFile_Estimate_KVCache(t *testing.T) { +func TestGGUFFile_Estimate_ContextSize(t *testing.T) { ctx := context.Background() f, err := ParseGGUFFileFromHuggingFace( ctx, "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", - "Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf", + "Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf", SkipLargeMetadata()) if err != nil { t.Fatal(err) @@ -93,32 +93,3 @@ func TestGGUFFile_Estimate_KVCache(t *testing.T) { }) } } - -func TestGGUFFile_Estimate_Offload(t *testing.T) { - ctx := context.Background() - - f, err := ParseGGUFFileFromHuggingFace( - ctx, - "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", - "Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf", - SkipLargeMetadata()) - if err != nil { - t.Fatal(err) - return - } - - cases := []struct { - name string - opts []GGUFEstimateOption - }{ - {"offload 0 layer", []GGUFEstimateOption{WithContextSize(512), WithOffloadLayers(0)}}, - {"offload 1 layer", []GGUFEstimateOption{WithContextSize(512), WithOffloadLayers(1)}}, - {"offload 10 layers", []GGUFEstimateOption{WithContextSize(512), WithOffloadLayers(10)}}, - {"offload 33 layers", []GGUFEstimateOption{WithContextSize(512), WithOffloadLayers(33)}}, // exceeds the number of layers - } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - t.Log("\n", spew.Sdump(f.Estimate(tc.opts...)), "\n") - }) - } -} diff --git a/file_model_test.go b/file_model_test.go index c52019a..5400944 100644 --- a/file_model_test.go +++ b/file_model_test.go @@ -16,7 +16,7 @@ func TestGGUFFile_Model(t *testing.T) { f, err := ParseGGUFFileFromHuggingFace( ctx, "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", - "Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf", + "Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf", SkipLargeMetadata()) if err != nil { t.Fatal(err) diff --git a/file_tokenizer_test.go b/file_tokenizer_test.go index 39cdf44..26ba749 100644 --- a/file_tokenizer_test.go +++ b/file_tokenizer_test.go @@ -14,7 +14,7 @@ func TestGGUFFile_Tokenizer(t *testing.T) { f, err := ParseGGUFFileFromHuggingFace( ctx, "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", - "Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf", + "Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf", SkipLargeMetadata()) if err != nil { t.Fatal(err) diff --git a/ggml.go b/ggml.go new file mode 100644 index 0000000..d438589 --- /dev/null +++ b/ggml.go @@ -0,0 +1,203 @@ +package gguf_parser + +import ( + "errors" + "fmt" + "slices" +) + +// Types for GGMLType. +type ( + // GGMLType is a type of GGML tensor, + // see https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#file-structure. + GGMLType uint32 + + // GGMLTypeTrait holds the trait of a GGMLType, + // see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/src/ggml.c#L564-L918. + GGMLTypeTrait struct { + BlockSize uint64 // Original is int, in order to reduce conversion, here we use uint64. 
+ TypeSize uint64 // Original is uint32, in order to reduce conversion, here we use uint64. + Quantized bool + } +) + +// GGMLType constants. +// +// GGMLTypeQ4_2, GGMLTypeQ4_3 are deprecated. +const ( + GGMLTypeF32 GGMLType = iota + GGMLTypeF16 + GGMLTypeQ4_0 + GGMLTypeQ4_1 + GGMLTypeQ4_2 + GGMLTypeQ4_3 + GGMLTypeQ5_0 + GGMLTypeQ5_1 + GGMLTypeQ8_0 + GGMLTypeQ8_1 + GGMLTypeQ2_K + GGMLTypeQ3_K + GGMLTypeQ4_K + GGMLTypeQ5_K + GGMLTypeQ6_K + GGMLTypeQ8_K + GGMLTypeIQ2_XXS + GGMLTypeIQ2_XS + GGMLTypeIQ3_XXS + GGMLTypeIQ1_S + GGMLTypeIQ4_NL + GGMLTypeIQ3_S + GGMLTypeIQ2_S + GGMLTypeIQ4_XS + GGMLTypeI8 + GGMLTypeI16 + GGMLTypeI32 + GGMLTypeI64 + GGMLTypeF64 + GGMLTypeIQ1_M + GGMLTypeBF16 + _GGMLTypeCount // Unknown +) + +// _GGMLTypeTraits is a table of GGMLTypeTrait for GGMLType. +var _GGMLTypeTraits = map[GGMLType]GGMLTypeTrait{ + GGMLTypeF32: {BlockSize: 1, TypeSize: 4}, + GGMLTypeF16: {BlockSize: 1, TypeSize: 2}, + GGMLTypeQ4_0: {BlockSize: 32, TypeSize: 18, Quantized: true}, + GGMLTypeQ4_1: {BlockSize: 32, TypeSize: 20, Quantized: true}, + GGMLTypeQ4_2: {BlockSize: 0, TypeSize: 0}, // Deprecated + GGMLTypeQ4_3: {BlockSize: 0, TypeSize: 0}, // Deprecated + GGMLTypeQ5_0: {BlockSize: 32, TypeSize: 22, Quantized: true}, + GGMLTypeQ5_1: {BlockSize: 32, TypeSize: 24, Quantized: true}, + GGMLTypeQ8_0: {BlockSize: 32, TypeSize: 34, Quantized: true}, + GGMLTypeQ8_1: {BlockSize: 32, TypeSize: 36, Quantized: true}, + GGMLTypeQ2_K: {BlockSize: 256, TypeSize: 84, Quantized: true}, + GGMLTypeQ3_K: {BlockSize: 256, TypeSize: 110, Quantized: true}, + GGMLTypeQ4_K: {BlockSize: 256, TypeSize: 144, Quantized: true}, + GGMLTypeQ5_K: {BlockSize: 256, TypeSize: 176, Quantized: true}, + GGMLTypeQ6_K: {BlockSize: 256, TypeSize: 210, Quantized: true}, + GGMLTypeQ8_K: {BlockSize: 256, TypeSize: 292, Quantized: true}, + GGMLTypeIQ2_XXS: {BlockSize: 256, TypeSize: 66, Quantized: true}, + GGMLTypeIQ2_XS: {BlockSize: 256, TypeSize: 74, Quantized: true}, + GGMLTypeIQ3_XXS: {BlockSize: 256, TypeSize: 98, Quantized: true}, + GGMLTypeIQ1_S: {BlockSize: 256, TypeSize: 50, Quantized: true}, + GGMLTypeIQ4_NL: {BlockSize: 32, TypeSize: 18, Quantized: true}, + GGMLTypeIQ3_S: {BlockSize: 256, TypeSize: 110, Quantized: true}, + GGMLTypeIQ2_S: {BlockSize: 256, TypeSize: 82, Quantized: true}, + GGMLTypeIQ4_XS: {BlockSize: 256, TypeSize: 136, Quantized: true}, + GGMLTypeI8: {BlockSize: 1, TypeSize: 1}, + GGMLTypeI16: {BlockSize: 1, TypeSize: 2}, + GGMLTypeI32: {BlockSize: 1, TypeSize: 4}, + GGMLTypeI64: {BlockSize: 1, TypeSize: 8}, + GGMLTypeF64: {BlockSize: 1, TypeSize: 8}, + GGMLTypeIQ1_M: {BlockSize: 256, TypeSize: 56, Quantized: true}, + GGMLTypeBF16: {BlockSize: 1, TypeSize: 2}, +} + +// Trait returns the GGMLTypeTrait of the GGMLType. +func (t GGMLType) Trait() (GGMLTypeTrait, bool) { + tt, ok := _GGMLTypeTraits[t] + return tt, ok +} + +// RowSizeOf returns the size of the given dimensions according to the GGMLType's GGMLTypeTrait, +// which is inspired by +// https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/src/ggml.c#L3142-L3145. +// +// The index of the given dimensions means the number of dimension, +// i.e. 0 is the first dimension, 1 is the second dimension, and so on. +// +// The value of the item is the number of elements in the corresponding dimension. 
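+//
+// For example, GGMLTypeQ4_K has BlockSize 256 and TypeSize 144, so a [4096, 4096]
+// tensor occupies 144*4096/256 = 2304 bytes per row, and 2304*4096 bytes in total.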
+func (t GGMLType) RowSizeOf(dimensions []uint64) uint64 { + if len(dimensions) == 0 { + panic(errors.New("no dimensions")) + } + + tt, ok := t.Trait() + if !ok { + panic(fmt.Errorf("invalid type: %v", t)) + } + + // https://github.com/ggerganov/ggml/blob/a10a8b880c059b3b29356eb9a9f8df72f03cdb6a/src/ggml.c#L2640-L2643 + ds := tt.TypeSize * dimensions[0] / tt.BlockSize // Row size + for i := 1; i < len(dimensions); i++ { + ds *= dimensions[i] + } + return ds +} + +// GGMLMemoryPadding returns the padded size of the given size according to GGML memory padding, +// see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/include/ggml/ggml.h#L255. +func GGMLMemoryPadding(size uint64) uint64 { + // https://github.com/ggerganov/ggml/blob/0cbb7c0/include/ggml/ggml.h#L238-L243 + const align = 16 + return (size + align - 1) &^ (align - 1) +} + +// GGML tensor constants. +const ( + // GGMLTensorSize is the size of GGML tensor in bytes, + // see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/include/ggml/ggml.h#L606. + GGMLTensorSize = 368 + + // GGMLObjectSize is the size of GGML object in bytes, + // see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/include/ggml/ggml.h#L563. + GGMLObjectSize = 32 +) + +// GGMLTensorOverhead is the overhead of GGML tensor in bytes, +// see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/src/ggml.c#L2765-L2767. +func GGMLTensorOverhead() uint64 { + return GGMLObjectSize + GGMLTensorSize +} + +// GGML computation graph constants. +const ( + // GGMLComputationGraphSize is the size of GGML computation graph in bytes. + GGMLComputationGraphSize = 80 + + // GGMLComputationGraphNodesMaximum is the maximum nodes of the computation graph, + // see https://github.com/ggerganov/llama.cpp/blob/7672adeec7a79ea271058c63106c142ba84f951a/llama.cpp#L103. + GGMLComputationGraphNodesMaximum = 8192 + + // GGMLComputationGraphNodesDefault is the default nodes of the computation graph, + // see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/include/ggml/ggml.h#L237. + GGMLComputationGraphNodesDefault = 2048 +) + +// GGMLComputationGraphOverhead is the overhead of GGML graph in bytes, +// see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/src/ggml.c#L18905-L18917. +func GGMLComputationGraphOverhead(nodes uint64, grads bool) uint64 { + const pointerSize = 8 + + var g uint64 = GGMLComputationGraphSize + g += pointerSize * nodes * 2 + if grads { + g += pointerSize * nodes + } + g += pointerSize * GGMLHashSize(nodes) + + return GGMLObjectSize + GGMLMemoryPadding(g) +} + +// GGMLHashSize returns the size of the hash table for the given base, +// see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/src/ggml.c#L17698-L17722. +func GGMLHashSize(base uint64) uint64 { + primes := []uint64{ + 2, 3, 5, 11, 17, 37, 67, 131, 257, 521, 1031, + 2053, 4099, 8209, 16411, 32771, 65537, 131101, + 262147, 524309, 1048583, 2097169, 4194319, 8388617, + 16777259, 33554467, 67108879, 134217757, 268435459, + 536870923, 1073741827, 2147483659, + } + i, ok := slices.BinarySearchFunc(primes, base, func(e, t uint64) int { + if t >= e { + return 0 + } + return -1 + }) + if !ok { + return base | 1 + } + return primes[i] +}
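A minimal sketch of the revised estimate flow follows; it reuses the Hugging Face fixture from the updated tests, and the option values are illustrative only:

```go
package main

import (
	"context"
	"fmt"

	. "github.com/thxcode/gguf-parser-go"
)

func main() {
	f, err := ParseGGUFFileFromHuggingFace(
		context.Background(),
		"NousResearch/Hermes-2-Pro-Mistral-7B-GGUF",
		"Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf",
		SkipLargeMetadata())
	if err != nil {
		panic(err)
	}

	e := f.Estimate(
		WithContextSize(4096), // 4K context
		WithParallelSize(2),   // two decoding sequences
		WithBatchSize(512))    // physical batch size

	// Sum() covers the KV cache, computation graph overhead and others;
	// add ModelWeight to get the w/o-mmap figure printed by cmd/gguf-parser.
	fmt.Println("kv cache:", e.KVCache.Sum())
	fmt.Println("usage:", e.Sum(), "w/o mmap:", e.Sum()+e.ModelWeight)
}
```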