diff --git a/cmd/gguf-parser/main.go b/cmd/gguf-parser/main.go index 9373f4f..577a57c 100644 --- a/cmd/gguf-parser/main.go +++ b/cmd/gguf-parser/main.go @@ -28,12 +28,15 @@ func main() { repo, model string // read options debug bool - mmap = true skipProxy bool skipTLS bool // estimate options - ctxSize = 512 - kvType = "f16" + ctxSize = -1 + kvType = "f16" + offloadLayers = -1 + batchSize = 512 + parallel = 1 + noMMap bool // output options version bool skipModel bool @@ -59,11 +62,14 @@ func main() { fs.StringVar(&model, "model", model, "Model below the --repo, e.g. "+ "Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf") fs.BoolVar(&debug, "debug", debug, "Debug mode") - fs.BoolVar(&mmap, "mmap", mmap, "Use mmap to read the local file") fs.BoolVar(&skipProxy, "skip-proxy", skipProxy, "Skip using proxy when reading from a remote URL") fs.BoolVar(&skipTLS, "skip-tls", skipTLS, "Skip TLS verification when reading from a remote URL") - fs.IntVar(&ctxSize, "ctx-size", ctxSize, "Context size to estimate memory usage") + fs.IntVar(&ctxSize, "ctx-size", ctxSize, "Context size to estimate memory usage, default is equal to the model's maximum context size") fs.StringVar(&kvType, "kv-type", kvType, "Key-Value cache type, select from [f32, f16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1]") + fs.IntVar(&offloadLayers, "offload-layers", offloadLayers, "Specify how many layers to offload, default is fully offloading") + fs.IntVar(&batchSize, "batch-size", batchSize, "Physical maximum batch size") + fs.IntVar(¶llel, "parallel", parallel, "Number of parallel sequences to decode") + fs.BoolVar(&noMMap, "no-mmap", noMMap, "Do not use memory-mapping, which influences the estimate result") fs.BoolVar(&version, "version", version, "Show version") fs.BoolVar(&skipModel, "skip-model", skipModel, "Skip model metadata") fs.BoolVar(&skipArchitecture, "skip-architecture", skipArchitecture, "Skip architecture metadata") @@ -85,13 +91,11 @@ func main() { ropts := []GGUFReadOption{ SkipLargeMetadata(), + UseMMap(), } if debug { ropts = append(ropts, UseDebug()) } - if mmap { - ropts = append(ropts, UseMMap()) - } if skipProxy { ropts = append(ropts, SkipProxy()) } @@ -99,11 +103,12 @@ func main() { ropts = append(ropts, SkipTLSVerification()) } - if ctxSize <= 0 { - ctxSize = 512 - } eopts := []GGUFEstimateOption{ - WithContextSize(int32(ctxSize)), + WithCacheValueType(GGMLTypeF16), + WithCacheKeyType(GGMLTypeF16), + } + if ctxSize > 0 { + eopts = append(eopts, WithContextSize(int32(ctxSize))) } if kvType != "" { kv := GGMLTypeF16 @@ -127,6 +132,15 @@ func main() { } eopts = append(eopts, WithCacheKeyType(kv), WithCacheValueType(kv)) } + if offloadLayers >= 0 { + eopts = append(eopts, WithOffloadLayers(uint64(offloadLayers))) + } + if batchSize > 0 { + eopts = append(eopts, WithBatchSize(int32(batchSize))) + } + if parallel > 0 { + eopts = append(eopts, WithParallelSize(int32(parallel))) + } // Parse GGUF file. 
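
Editor's aside (not part of the patch): a minimal sketch of how the new flags map onto estimate options when calling the library directly. The module path github.com/thxcode/gguf-parser-go and the exported helpers are taken from this patch; the Hugging Face repository and file names are only illustrative.

package main

import (
	"context"
	"fmt"

	. "github.com/thxcode/gguf-parser-go"
)

func main() {
	// CLI-style inputs: -1 would mean "use the model's maximum context size"
	// and "offload every layer"; here 10 layers are offloaded explicitly.
	ctxSize, offloadLayers, batchSize, parallel := -1, 10, 512, 1

	eopts := []GGUFEstimateOption{
		WithCacheKeyType(GGMLTypeF16),
		WithCacheValueType(GGMLTypeF16),
	}
	if ctxSize > 0 {
		eopts = append(eopts, WithContextSize(int32(ctxSize)))
	}
	if offloadLayers >= 0 {
		eopts = append(eopts, WithOffloadLayers(uint64(offloadLayers)))
	}
	if batchSize > 0 {
		eopts = append(eopts, WithBatchSize(int32(batchSize)))
	}
	if parallel > 0 {
		eopts = append(eopts, WithParallelSize(int32(parallel)))
	}

	f, err := ParseGGUFFileFromHuggingFace(context.Background(),
		"NousResearch/Hermes-2-Pro-Mistral-7B-GGUF",
		"Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf",
		SkipLargeMetadata())
	if err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", f.Estimate(eopts...).Sum(true /* mmap */))
}
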
@@ -183,7 +197,8 @@ func main() { o["tokenizer"] = t } if !skipEstimate { - o["estimate"] = e + es := e.Sum(!noMMap) + o["estimate"] = es } enc := stdjson.NewEncoder(os.Stdout) @@ -237,9 +252,10 @@ func main() { } tprintf( "TOKENIZER", - []string{"Model", "Tokens Len", "Added Tokens Len", "BOS Token", "EOS Token", "Unknown Token", "Separator Token", "Padding Token"}, + []string{"Model", "Tokens Size", "Tokens Len", "Added Tokens Len", "BOS Token", "EOS Token", "Unknown Token", "Separator Token", "Padding Token"}, []string{ t.Model, + sprintf(GGUFBytesScalar(t.TokensSize)), sprintf(t.TokensLength), sprintf(t.AddedTokensLength), sprintTokenID(t.BOSTokenID), @@ -251,16 +267,29 @@ func main() { } if !skipEstimate { + es := e.Sum(!noMMap) + if ctxSize <= 0 { + if a.MaximumContextLength == 0 { + a = gf.Architecture() + } + ctxSize = int(a.MaximumContextLength) + } tprintf( "ESTIMATE", - []string{"Context Size", "Model Weight", "KV Cache", "Computation Graph Overhead", "Others", "Usage (w/o MMap)"}, + []string{"Mem. Arch", "MMap", "Context Size", "(CPU) RAM", "(GPU) VRAM"}, + []string{ + "UMA", + sprintf(!noMMap), + sprintf(ctxSize), + sprintf(es.UMA.RAM), + sprintf(es.UMA.VRAM), + }, []string{ + "NonUMA", + sprintf(!noMMap), sprintf(ctxSize), - sprintf(e.ModelWeight), - sprintf(e.KVCache.Sum()), - sprintf(e.ComputationGraphOverhead), - sprintf(e.Others), - sprintf(e.Sum()) + " (" + sprintf(e.Sum()+e.ModelWeight) + ")", + sprintf(es.NonUMA.RAM), + sprintf(es.NonUMA.VRAM), }) } } diff --git a/file.go b/file.go index 1e3622d..612b5c0 100644 --- a/file.go +++ b/file.go @@ -159,6 +159,9 @@ type ( // // The offset is the start of the file. StartOffset int64 `json:"startOffset"` + + // Size is the size of the array in bytes. + Size int64 `json:"size"` } // GGUFMetadataKVs is a list of GGUFMetadataKV. @@ -1286,6 +1289,11 @@ func (rd _GGUFReader) ReadArray() (v GGUFMetadataKVArrayValue, err error) { return v, fmt.Errorf("read array length: %w", err) } + itemStart, err := rd.f.Seek(0, io.SeekCurrent) + if err != nil { + return v, fmt.Errorf("seek array item start: %w", err) + } + if !rd.o.SkipLargeMetadata { v.Array = make([]any, v.Len) for i := uint64(0); i < v.Len; i++ { @@ -1295,6 +1303,12 @@ func (rd _GGUFReader) ReadArray() (v GGUFMetadataKVArrayValue, err error) { } } + itemEnd, err := rd.f.Seek(0, io.SeekCurrent) + if err != nil { + return v, fmt.Errorf("seek array item end: %w", err) + } + v.Size = itemEnd - itemStart + return v, nil } @@ -1321,6 +1335,12 @@ func (rd _GGUFReader) ReadArray() (v GGUFMetadataKVArrayValue, err error) { return v, fmt.Errorf("seek array end: %w", err) } + itemEnd, err := rd.f.Seek(0, io.SeekCurrent) + if err != nil { + return v, fmt.Errorf("seek array item end: %w", err) + } + v.Size = itemEnd - itemStart + return v, nil } diff --git a/file_estimate.go b/file_estimate.go index 186f02b..7a43727 100644 --- a/file_estimate.go +++ b/file_estimate.go @@ -1,20 +1,41 @@ package gguf_parser import ( + "regexp" + "strings" + "github.com/thxcode/gguf-parser-go/util/ptr" ) +// GGUFEstimate represents the estimated result of the GGUF file. +type GGUFEstimate struct { + // Load is the memory usage of the load part. + Load GGUFMemoryUsage `json:"load"` + // Offload is the memory usage of the offload part. + Offload GGUFMemoryUsage `json:"offload"` +} + type ( - // GGUFEstimate represents the estimated result of the GGUF file. - GGUFEstimate struct { - // ModelWeight is the memory usage of model weight.
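
Editor's aside (not part of the patch): the new Size field above is filled by comparing the reader's offset before and after the array decode. The same bookkeeping in isolation, as a sketch with a hypothetical measureConsumed helper:

package main

import (
	"fmt"
	"io"
	"strings"
)

// measureConsumed reports how many bytes fn consumed from r by seeking to the
// current offset before and after the call, mirroring the itemStart/itemEnd
// bookkeeping added around _GGUFReader.ReadArray.
func measureConsumed(r io.ReadSeeker, fn func(io.Reader) error) (int64, error) {
	start, err := r.Seek(0, io.SeekCurrent)
	if err != nil {
		return 0, err
	}
	if err := fn(r); err != nil {
		return 0, err
	}
	end, err := r.Seek(0, io.SeekCurrent)
	if err != nil {
		return 0, err
	}
	return end - start, nil
}

func main() {
	r := strings.NewReader("hello, gguf")
	n, err := measureConsumed(r, func(r io.Reader) error {
		_, err := io.CopyN(io.Discard, r, 5) // pretend to decode 5 bytes
		return err
	})
	fmt.Println(n, err) // 5 <nil>
}
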
- ModelWeight GGUFBytesScalar `json:"modelWeight"` + // GGUFMemoryUsage represents the memory usage of the GGUF file. + GGUFMemoryUsage struct { + // Weight is the memory usage of weight. + Weight GGUFWeightUsage `json:"weight"` // KVCache is the usage of key-value cache. KVCache GGUFKVCacheUsage `json:"kvCache"` - // ComputationGraphOverhead is the overhead of computation graph. - ComputationGraphOverhead GGUFBytesScalar `json:"computationGraphOverhead"` - // Others is the trivial usage. - Others GGUFBytesScalar `json:"others"` + // Tokens is the memory usage of token. + Tokens GGUFBytesScalar `json:"tokens"` + // Compute is the memory usage of computation. + Compute GGUFComputeUsage `json:"compute"` + } + + // GGUFWeightUsage represents the memory usage of model weight. + GGUFWeightUsage struct { + // Compute is the memory usage of computing. + Compute GGUFBytesScalar `json:"compute"` + // Input is the memory usage of input. + Input GGUFBytesScalar `json:"input"` + // Output is the memory usage of output. + Output GGUFBytesScalar `json:"output"` } // GGUFKVCacheUsage represents the usage of kv-cache. @@ -24,6 +45,14 @@ type ( // Value is the memory usage of the cached value. Value GGUFBytesScalar `json:"value"` } + + // GGUFComputeUsage represents the memory usage of computation. + GGUFComputeUsage struct { + // Graph is the memory usage of computation graph. + Graph GGUFBytesScalar `json:"graph"` + // Others is the trivial usage. + Others GGUFBytesScalar `json:"others"` + } ) // Estimate returns the inference usage estimated result of the GGUF file. @@ -33,21 +62,69 @@ func (gf *GGUFFile) Estimate(opts ...GGUFEstimateOption) (ge GGUFEstimate) { opt(&o) } - a := gf.Architecture() + a, t := gf.Architecture(), gf.Tokenizer() - contextSize := a.MaximumContextLength + nContext := a.MaximumContextLength if o.ContextSize != nil { - contextSize = uint64(*o.ContextSize) + nContext = uint64(*o.ContextSize) + } + + var ( + nLoadLayers = a.BlockCount + nOffloadLayers uint64 + nBatch = min(nContext, uint64(ptr.Deref(o.BatchSize, 512))) + nParallel = uint64(ptr.Deref(o.ParallelSize, 1)) + ) + { + if v := o.OffloadLayers; v == nil { + o.OffloadLayers = ptr.To(a.BlockCount) + nOffloadLayers = nLoadLayers + } else if *v > 0 { + nOffloadLayers = *v + if nOffloadLayers > nLoadLayers { + nOffloadLayers = nLoadLayers + } + } + nLoadLayers -= nOffloadLayers } + ls := gf.Layers() + ioLs, tfLs, _ := ls.Cut([]string{ + "token_embd.weight", + "output.weight", + "output_norm.weight", + }) + // Model weight. - ge.ModelWeight = gf.ModelSize + { + // Compute. + for i, offloadStart := uint64(0), uint64(len(tfLs))-nOffloadLayers; i < uint64(len(tfLs)); i++ { + switch { + case i < nLoadLayers: + ge.Load.Weight.Compute += GGUFBytesScalar(tfLs[i].Bytes()) + case i >= offloadStart: + ge.Offload.Weight.Compute += GGUFBytesScalar(tfLs[i].Bytes()) + } + } + + // IO, + // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L4930-L5002. + inpLs, outLs, _ := ioLs.Cut([]string{ + "token_embd.weight", + }) + ge.Load.Weight.Input = GGUFBytesScalar(inpLs.Bytes()) + ge.Load.Weight.Output = GGUFBytesScalar(outLs.Bytes()) + if nOffloadLayers == a.BlockCount { + ge.Offload.Weight.Output = ge.Load.Weight.Output + ge.Load.Weight.Output = 0 + } + } // KV cache, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L2479-L2501. 
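
Editor's aside (not part of the patch): the load/offload split used by Estimate, isolated into a hypothetical splitLayers helper. A nil request means "offload everything"; requests above the block count are clamped.

package main

import "fmt"

// splitLayers returns how many transformer layers stay on the host (load) and
// how many go to the accelerator (offload), following the patch's clamping.
func splitLayers(blockCount uint64, offload *uint64) (nLoad, nOffload uint64) {
	switch {
	case offload == nil:
		nOffload = blockCount
	case *offload > 0:
		nOffload = min(*offload, blockCount)
	}
	return blockCount - nOffload, nOffload
}

func main() {
	ten, huge := uint64(10), uint64(99)
	fmt.Println(splitLayers(32, nil))   // 0 32  (fully offloaded by default)
	fmt.Println(splitLayers(32, &ten))  // 22 10
	fmt.Println(splitLayers(32, &huge)) // 0 32  (clamped to the block count)
}
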
{ kt, vt := GGMLTypeF16, GGMLTypeF16 - kvSize := contextSize + nKV := nContext if o.CacheKeyType != nil { kt = *o.CacheKeyType } @@ -57,7 +134,7 @@ func (gf *GGUFFile) Estimate(opts ...GGUFEstimateOption) (ge GGUFEstimate) { if a.Architecture == "mamba" { // See https://github.com/ggerganov/llama.cpp/blob/7672adeec7a79ea271058c63106c142ba84f951a/llama.cpp#L16122-L16129. kt, vt = GGMLTypeF32, GGMLTypeF32 - kvSize = uint64(ptr.Deref(o.ParallelSize, 1)) + nKV = nParallel } var ( @@ -69,54 +146,139 @@ func (gf *GGUFFile) Estimate(opts ...GGUFEstimateOption) (ge GGUFEstimate) { embedValGQA += uint64(a.SSMStateSize * a.SSMInnerSize) } - krs := kt.RowSizeOf([]uint64{embedKeyGQA * kvSize}) - vrs := vt.RowSizeOf([]uint64{embedValGQA * kvSize}) + krs := kt.RowSizeOf([]uint64{embedKeyGQA * nKV}) + vrs := vt.RowSizeOf([]uint64{embedValGQA * nKV}) - ge.KVCache.Key = GGUFBytesScalar(krs * a.BlockCount) - ge.KVCache.Value = GGUFBytesScalar(vrs * a.BlockCount) + ge.Load.KVCache.Key = GGUFBytesScalar(krs * nLoadLayers) + ge.Load.KVCache.Value = GGUFBytesScalar(vrs * nLoadLayers) + ge.Offload.KVCache.Key = GGUFBytesScalar(krs * nOffloadLayers) + ge.Offload.KVCache.Value = GGUFBytesScalar(vrs * nOffloadLayers) } - // Others. + // Tokens. + ge.Load.Tokens += GGUFBytesScalar(t.TokensSize) + ge.Load.Tokens += GGUFBytesScalar(t.TokensLength * (4 /* token type */ + 4 /* token score*/)) + if t.Model == "gpt2" { + ge.Load.Tokens += GGUFBytesScalar(t.MergesSize) + ge.Load.Tokens += GGUFBytesScalar(t.MergesLength * (48 /* key type */ + 56 /* value type */)) + } + + // Compute. { - // Overhead - ge.Others += GGUFBytesScalar(15 * 1024 * 1024) // NB(thxCode): Magic here. + // Bootstrap. + ge.Load.Compute.Others += GGUFBytesScalar(15 * 1024 * 1024) // GGML context, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L5015-L5036. ggmlCtx := 2 /* buffer count */ * GGMLTensorOverhead() * (uint64(len(gf.TensorInfos)) + 1 + a.BlockCount*3) - ge.Others += GGUFBytesScalar(ggmlCtx) + ge.Load.Compute.Others += GGUFBytesScalar(ggmlCtx) // Output buffer, // see https://github.com/ggerganov/llama.cpp/blob/7672adeec7a79ea271058c63106c142ba84f951a/llama.cpp#L11940-L12003. - outBuffer := 4 /* float32 size */ * (a.VocabularyLength + a.EmbeddingLength) * uint64(ptr.Deref(o.ParallelSize, 1)) - ge.Others += GGUFBytesScalar(outBuffer) - } + outBuffer := 4 /* float32 size */ * (a.VocabularyLength + a.EmbeddingLength) * nParallel + ge.Load.Compute.Others += GGUFBytesScalar(outBuffer) - // Computation graph. - { + // Graph overhead. graphOverhead := GGMLTensorOverhead()*GGMLComputationGraphNodesMaximum + GGMLComputationGraphOverhead(GGMLComputationGraphNodesMaximum, false) - ge.ComputationGraphOverhead += GGUFBytesScalar(graphOverhead) + ge.Load.Compute.Others += GGUFBytesScalar(graphOverhead) + } + // Computation graph. + { + // Tensor usage, + // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L16149. + // + // Firstly, get the usage of input tensors. 
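
Editor's aside (not part of the patch): the per-layer KV-cache split in plain byte arithmetic. This sketch assumes 2-byte f16 cache cells, whereas the library goes through GGMLType.RowSizeOf so quantized cache types stay exact.

package main

import "fmt"

// kvCacheBytes charges one key row (embedKeyGQA*nKV cells) and one value row
// (embedValGQA*nKV cells) per layer, splitting the total between host-resident
// and offloaded layers as the patch does.
func kvCacheBytes(embedKeyGQA, embedValGQA, nKV, nLoadLayers, nOffloadLayers uint64) (load, offload uint64) {
	const f16 = 2 // bytes per cell, assumed here for simplicity
	perLayer := f16*embedKeyGQA*nKV + f16*embedValGQA*nKV
	return perLayer * nLoadLayers, perLayer * nOffloadLayers
}

func main() {
	// Illustrative shape: 4096-wide KV projection, 512-token cache,
	// 22 layers kept on the host and 10 offloaded.
	load, offload := kvCacheBytes(4096, 4096, 512, 22, 10)
	fmt.Println(load, offload) // 184549376 83886080
}
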
var ( - nBatch = min(contextSize, uint64(ptr.Deref(o.BatchSize, 512))) - inpTokens = GGMLTypeI32.RowSizeOf([]uint64{nBatch}) // I32 [n_batch] inpEmbd = GGMLTypeF32.RowSizeOf([]uint64{a.EmbeddingLength, nBatch}) // F32 [n_embd, n_batch] - inpPos = GGMLTypeI32.RowSizeOf([]uint64{contextSize}) // I32 [n_tokens] - inpOutIds = GGMLTypeI32.RowSizeOf([]uint64{contextSize}) // I32 [n_output], - inpKQMask = GGMLTypeF32.RowSizeOf([]uint64{contextSize, nBatch}) // F32 [n_kv, n_batch] + inpPos = GGMLTypeI32.RowSizeOf([]uint64{nContext}) // I32 [n_tokens] + inpOutIds = GGMLTypeI32.RowSizeOf([]uint64{nContext}) // I32 [n_output], + inpKQMask = GGMLTypeF32.RowSizeOf([]uint64{nContext, nBatch}) // F32 [n_kv, n_batch] ) - ge.ComputationGraphOverhead += GGUFBytesScalar(inpTokens + inpEmbd + inpPos + inpKQMask + inpOutIds) + ge.Load.Compute.Graph += GGUFBytesScalar(inpTokens + inpEmbd + inpPos + inpKQMask + inpOutIds) + if nOffloadLayers > 0 { + ge.Offload.Compute.Graph += GGUFBytesScalar(inpEmbd + inpPos + inpKQMask + inpOutIds) + } + // Since the steps between transformer layers are serial, + // the allocated memory can be reused for the next layer. + // So, we only consider the usage of the largest layer, + // which is the last layer by default. + kvcInc := uint64(ge.Load.KVCache.Key + ge.Offload.KVCache.Key) + for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.attn_(norm|q)\.weight`)) { + rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nBatch}) + kvcInc += rs + if strings.HasSuffix(l.Name, ".attn_q.weight") { + kvcInc += rs // for RoPE + } + } + var ffnInc uint64 + for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.(attn_norm|ffn_norm|ffn_gate|ffn_up)\.weight`)) { + rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nBatch}) + ffnInc += rs + } + if nLoadLayers == a.BlockCount { + ge.Load.Compute.Graph += GGUFBytesScalar(max(kvcInc, ffnInc)) + } else { + ge.Offload.Compute.Graph += GGUFBytesScalar(max(kvcInc, ffnInc)) + if nLoadLayers > 0 { + ffnInc = 0 + for _, l := range tfLs[nLoadLayers-1].Search(regexp.MustCompile(`.*\.\d+\.ffn_(norm|gate|up)\.weight`)) { + rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nBatch}) + ffnInc += rs + } + ge.Load.Compute.Graph += GGUFBytesScalar(max(kvcInc, ffnInc)) + } + } } return ge } -func (e GGUFEstimate) Sum() GGUFBytesScalar { - return e.KVCache.Sum() + e.ComputationGraphOverhead + e.Others +type ( + GGUFEstimateSum struct { + // UMA is the usage of unified memory architecture. + UMA GGUFEstimateSumItem `json:"uma"` + // NonUMA is the usage of non-unified memory architecture. + NonUMA GGUFEstimateSumItem `json:"nonUMA"` + } + GGUFEstimateSumItem struct { + // RAM is the memory usage of the RAM. + RAM GGUFBytesScalar `json:"ram"` + // VRAM is the memory usage of the VRAM. 
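
Editor's aside (not part of the patch): the graph's input buffers expressed as plain element counts: token IDs, positions and output IDs are int32; the embedding and KQ-mask buffers are float32. The library computes the same quantities via GGMLType.RowSizeOf.

package main

import "fmt"

func inputTensorBytes(nEmbd, nContext, nBatch uint64) uint64 {
	const i32, f32 = 4, 4
	inpTokens := i32 * nBatch            // I32 [n_batch]
	inpEmbd := f32 * nEmbd * nBatch      // F32 [n_embd, n_batch]
	inpPos := i32 * nContext             // I32 [n_tokens]
	inpOutIds := i32 * nContext          // I32 [n_output]
	inpKQMask := f32 * nContext * nBatch // F32 [n_kv, n_batch]
	return inpTokens + inpEmbd + inpPos + inpOutIds + inpKQMask
}

func main() {
	fmt.Println(inputTensorBytes(4096, 4096, 512)) // 16812032 bytes (~16 MiB)
}
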
+ VRAM GGUFBytesScalar `json:"vram"` + } +) + +func (e GGUFEstimate) Sum(mmap bool) (gs GGUFEstimateSum) { + gs.UMA = GGUFEstimateSumItem{ + RAM: e.Load.KVCache.Sum() + e.Offload.KVCache.Sum() + e.Load.Tokens + e.Load.Compute.Others, + VRAM: e.Offload.Compute.Sum(), + } + if !mmap { + gs.UMA.RAM += e.Load.Weight.Sum() + gs.UMA.VRAM += e.Offload.Weight.Sum() + } + gs.NonUMA = GGUFEstimateSumItem{ + RAM: e.Load.KVCache.Sum() + e.Load.Tokens + e.Load.Compute.Sum(), + VRAM: e.Offload.KVCache.Sum() + e.Offload.Compute.Sum(), + } + if !mmap { + gs.NonUMA.RAM += e.Load.Weight.Sum() + gs.NonUMA.VRAM += e.Offload.Weight.Sum() + } + return gs +} + +func (w GGUFWeightUsage) Sum() GGUFBytesScalar { + return w.Compute + w.Input + w.Output } func (c GGUFKVCacheUsage) Sum() GGUFBytesScalar { return c.Key + c.Value } + +func (c GGUFComputeUsage) Sum() GGUFBytesScalar { + return c.Graph + c.Others +} diff --git a/file_estimate_option.go b/file_estimate_option.go index 80f2b10..5fb93ef 100644 --- a/file_estimate_option.go +++ b/file_estimate_option.go @@ -11,6 +11,7 @@ type ( BatchSize *int32 CacheKeyType *GGMLType CacheValueType *GGMLType + OffloadLayers *uint64 } GGUFEstimateOption func(*_GGUFEstimateOptions) ) @@ -72,3 +73,10 @@ func WithCacheValueType(t GGMLType) GGUFEstimateOption { } } } + +// WithOffloadLayers sets the number of layers to offload. +func WithOffloadLayers(layers uint64) GGUFEstimateOption { + return func(o *_GGUFEstimateOptions) { + o.OffloadLayers = &layers + } +} diff --git a/file_estimate_test.go b/file_estimate_test.go index 5f7380f..4d98e5e 100644 --- a/file_estimate_test.go +++ b/file_estimate_test.go @@ -93,3 +93,33 @@ func TestGGUFFile_Estimate_ContextSize(t *testing.T) { }) } } + +func TestGGUFFile_Estimate_OffloadLayers(t *testing.T) { + ctx := context.Background() + + f, err := ParseGGUFFileFromHuggingFace( + ctx, + "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", + "Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf", + SkipLargeMetadata()) + if err != nil { + t.Fatal(err) + return + } + + cases := []struct { + name string + opts []GGUFEstimateOption + }{ + {"offload 0 layer", []GGUFEstimateOption{WithContextSize(512), WithOffloadLayers(0)}}, + {"offload 1 layer", []GGUFEstimateOption{WithContextSize(512), WithOffloadLayers(1)}}, + {"offload 10 layers", []GGUFEstimateOption{WithContextSize(512), WithOffloadLayers(10)}}, + {"offload all layers", []GGUFEstimateOption{WithContextSize(512)}}, + {"offload 33 layers", []GGUFEstimateOption{WithContextSize(512), WithOffloadLayers(33)}}, // exceeds the number of layers + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Log("\n", spew.Sdump(f.Estimate(tc.opts...)), "\n") + }) + } +} diff --git a/file_tokenizer.go b/file_tokenizer.go index 2865a01..29ba9aa 100644 --- a/file_tokenizer.go +++ b/file_tokenizer.go @@ -2,10 +2,14 @@ package gguf_parser // GGUFTokenizerMetadata represents the tokenizer metadata of a GGUF file. type GGUFTokenizerMetadata struct { + /* Basic */ + // Model is the model of the tokenizer. Model string `json:"model"` // TokensLength is the size of tokens. - TokensLength uint64 `json:"tokenLength"` + TokensLength uint64 `json:"tokensLength"` + // MergesLength is the size of merges. + MergesLength uint64 `json:"mergesLength"` // AddedTokensLength is the size of added tokens after training. AddedTokensLength uint64 `json:"addedTokenLength"` // BOSTokenID is the ID of the beginning of sentence token. @@ -28,6 +32,13 @@ type GGUFTokenizerMetadata struct { // // Use -1 if the token is not found.
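
Editor's aside (not part of the patch): how a caller can compare the new mmap-aware sums. With mmap the weights are paged from the file and excluded from both totals; without it they are charged to RAM for the loaded layers and to VRAM for the offloaded ones. The repository and file names below simply reuse the ones from the test above.

package main

import (
	"context"
	"fmt"

	. "github.com/thxcode/gguf-parser-go"
)

func main() {
	f, err := ParseGGUFFileFromHuggingFace(context.Background(),
		"NousResearch/Hermes-2-Pro-Mistral-7B-GGUF",
		"Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf",
		SkipLargeMetadata())
	if err != nil {
		panic(err)
	}

	e := f.Estimate(WithContextSize(512), WithOffloadLayers(10))
	withMMap, noMMap := e.Sum(true), e.Sum(false)
	fmt.Printf("NonUMA with mmap:    RAM=%v VRAM=%v\n", withMMap.NonUMA.RAM, withMMap.NonUMA.VRAM)
	fmt.Printf("NonUMA without mmap: RAM=%v VRAM=%v\n", noMMap.NonUMA.RAM, noMMap.NonUMA.VRAM)
}
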
PaddingTokenID int64 `json:"paddingTokenID"` + + /* Appendix */ + + // TokensSize is the size of tokens in bytes. + TokensSize int64 `json:"tokensSize"` + // MergesSize is the size of merges in bytes. + MergesSize int64 `json:"mergesSize"` } // Tokenizer returns the tokenizer metadata of a GGUF file. @@ -35,6 +46,7 @@ func (gf *GGUFFile) Tokenizer() (gt GGUFTokenizerMetadata) { const ( modelKey = "tokenizer.ggml.model" tokensKey = "tokenizer.ggml.tokens" + mergesKey = "tokenizer.ggml.merges" addedTokensKey = "tokenizer.ggml.added_tokens" bosTokenIDKey = "tokenizer.ggml.bos_token_id" eosTokenIDKey = "tokenizer.ggml.eos_token_id" @@ -46,6 +58,7 @@ func (gf *GGUFFile) Tokenizer() (gt GGUFTokenizerMetadata) { m, _ := gf.Header.MetadataKV.Index([]string{ modelKey, tokensKey, + mergesKey, addedTokensKey, bosTokenIDKey, eosTokenIDKey, @@ -64,7 +77,14 @@ func (gf *GGUFFile) Tokenizer() (gt GGUFTokenizerMetadata) { if v, ok := m[modelKey]; ok { gt.Model = v.ValueString() } if v, ok := m[tokensKey]; ok { - gt.TokensLength = v.ValueArray().Len + arr := v.ValueArray() + gt.TokensLength = arr.Len + gt.TokensSize = arr.Size + } + if v, ok := m[mergesKey]; ok { + arr := v.ValueArray() + gt.MergesLength = arr.Len + gt.MergesSize = arr.Size + } if v, ok := m[addedTokensKey]; ok { gt.AddedTokensLength = v.ValueArray().Len
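
Editor's aside (not part of the patch): how the new TokensSize/MergesSize fields feed the "Tokens" term of the estimate: raw array bytes plus a fixed per-entry overhead (4 B type + 4 B score per token; 48 B key + 56 B value per merge for a gpt2-style BPE tokenizer). The constants are the ones used by the patch; tokenBytes itself is a hypothetical helper, not a library function.

package main

import (
	"fmt"

	. "github.com/thxcode/gguf-parser-go"
)

// tokenBytes estimates the host memory attributed to tokenizer data.
func tokenBytes(t GGUFTokenizerMetadata) uint64 {
	n := uint64(t.TokensSize) + t.TokensLength*(4+4)
	if t.Model == "gpt2" {
		n += uint64(t.MergesSize) + t.MergesLength*(48+56)
	}
	return n
}

func main() {
	// Made-up sizes, purely to exercise the arithmetic.
	t := GGUFTokenizerMetadata{Model: "gpt2", TokensLength: 32000, TokensSize: 380000, MergesLength: 60000, MergesSize: 500000}
	fmt.Println(tokenBytes(t)) // 7376000 bytes for this made-up tokenizer
}
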