From 67ab8015510483ea2a62b61ea8e0e812d73ea3cb Mon Sep 17 00:00:00 2001 From: thxCode Date: Wed, 29 May 2024 13:18:52 +0800 Subject: [PATCH] refactor: general and cmd Signed-off-by: thxCode --- cmd/gguf-parser/main.go | 138 ++++++++++++++++++++++++++++------------ file.go | 2 +- file_architecture.go | 6 +- file_estimate.go | 2 +- file_model.go | 1 - 5 files changed, 101 insertions(+), 48 deletions(-) diff --git a/cmd/gguf-parser/main.go b/cmd/gguf-parser/main.go index ecbd577..86c63f9 100644 --- a/cmd/gguf-parser/main.go +++ b/cmd/gguf-parser/main.go @@ -34,9 +34,13 @@ func main() { // estimate options ctxSize = 512 kvType = "f16" - // output - json bool - jsonPretty = true + // output options + skipModel bool + skipArchitecture bool + skipTokenizer bool + skipEstimate bool + json bool + jsonPretty = true ) fs := flag.NewFlagSet(os.Args[0], flag.ExitOnError) fs.Usage = func() { @@ -58,8 +62,12 @@ func main() { fs.BoolVar(&mmap, "mmap", mmap, "Use mmap to read the local file") fs.BoolVar(&skipProxy, "skip-proxy", skipProxy, "Skip using proxy when reading from a remote URL") fs.BoolVar(&skipTLS, "skip-tls", skipTLS, "Skip TLS verification when reading from a remote URL") - fs.IntVar(&ctxSize, "ctx-size", ctxSize, "Maximum context size to estimate memory usage") + fs.IntVar(&ctxSize, "ctx-size", ctxSize, "Context size to estimate memory usage") fs.StringVar(&kvType, "kv-type", kvType, "Key-Value cache type, select from [f32, f16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1]") + fs.BoolVar(&skipModel, "skip-model", skipModel, "Skip model metadata") + fs.BoolVar(&skipArchitecture, "skip-architecture", skipArchitecture, "Skip architecture metadata") + fs.BoolVar(&skipTokenizer, "skip-tokenizer", skipTokenizer, "Skip tokenizer metadata") + fs.BoolVar(&skipEstimate, "skip-estimate", skipEstimate, "Skip estimate") fs.BoolVar(&json, "json", json, "Output as JSON") fs.BoolVar(&jsonPretty, "json-pretty", jsonPretty, "Output as pretty JSON") if err := fs.Parse(os.Args[1:]); err != nil { @@ -132,15 +140,40 @@ func main() { } } - m, a, e := gf.Model(), gf.Architecture(), gf.Estimate(eopts...) + var ( + m GGUFModelMetadata + a GGUFArchitectureMetadata + t GGUFTokenizerMetadata + e GGUFEstimate + ) + if !skipModel { + m = gf.Model() + } + if !skipArchitecture { + a = gf.Architecture() + } + if !skipTokenizer { + t = gf.Tokenizer() + } + if !skipEstimate { + e = gf.Estimate(eopts...) + } // Output if json { - o := map[string]any{ - "model": m, - "architecture": a, - "estimate": e, + o := map[string]any{} + if !skipModel { + o["model"] = m + } + if !skipArchitecture { + o["architecture"] = a + } + if !skipTokenizer { + o["tokenizer"] = t + } + if !skipEstimate { + o["estimate"] = e } enc := stdjson.NewEncoder(os.Stdout) @@ -155,39 +188,60 @@ func main() { return } - tprintf( - []string{"Name", "Architecture", "Quantization Version", "File Type", "Little Endian", "Size", "Parameters", "BPW"}, - []string{ - m.Name, - m.Architecture, - sprintf(m.QuantizationVersion), - sprintf(m.FileType), - sprintf(m.LittleEndian), - m.Size.String(), - m.Parameters.String(), - m.BitsPerWeight.String(), - }) - - tprintf( - []string{"Context Length", "Embedding Length", "Layers", "Feed Forward Length", "Expert Count", "Vocabulary Length"}, - []string{ - sprintf(a.ContextLength), - sprintf(a.EmbeddingLength), - fmt.Sprintf("%d + 1 = %d", - a.BlockCount, - a.BlockCount+1), - sprintf(a.FeedForwardLength), - sprintf(a.ExpertCount), - sprintf(a.VocabularyLength), - }) - - tprintf( - []string{"Load Memory", "KVCache Memory", "Total Memory"}, - []string{ - e.MemoryLoad.String(), - e.KVCache.MemoryTotal.String(), - e.MemoryTotal.String(), - }) + if !skipModel { + tprintf( + []string{"Name", "Architecture", "Quantization Version", "File Type", "Little Endian", "Size", "Parameters", "BPW"}, + []string{ + m.Name, + m.Architecture, + sprintf(m.QuantizationVersion), + sprintf(m.FileType), + sprintf(m.LittleEndian), + m.Size.String(), + m.Parameters.String(), + m.BitsPerWeight.String(), + }) + } + + if !skipArchitecture { + tprintf( + []string{"Maximum Context Length", "Embedding Length", "Layers", "Feed Forward Length", "Expert Count", "Vocabulary Length"}, + []string{ + sprintf(a.MaximumContextLength), + sprintf(a.EmbeddingLength), + fmt.Sprintf("%d + 1 = %d", + a.BlockCount, + a.BlockCount+1), + sprintf(a.FeedForwardLength), + sprintf(a.ExpertCount), + sprintf(a.VocabularyLength), + }) + } + + if !skipTokenizer { + tprintf( + []string{"Tokenizer Model", "Tokens Length", "Added Tokens Length", "BOS", "EOS", "Unknown", "Separator", "Padding"}, + []string{ + t.Model, + sprintf(t.TokensLength), + sprintf(t.AddedTokensLength), + sprintf(t.BOSTokenID), + sprintf(t.EOSTokenID), + sprintf(t.UnknownTokenID), + sprintf(t.SeparatorTokenID), + sprintf(t.PaddingTokenID), + }) + } + + if !skipEstimate { + tprintf( + []string{"Load Memory", "KVCache Memory", "Total Memory"}, + []string{ + e.MemoryLoad.String(), + e.KVCache.MemoryTotal.String(), + e.MemoryTotal.String(), + }) + } } func sprintf(a any) string { diff --git a/file.go b/file.go index 8db5c18..8ef076a 100644 --- a/file.go +++ b/file.go @@ -939,7 +939,7 @@ func (gf *GGUFFile) guessParameters() GGUFParametersScalar { // = BlockCount * (12 * EmbeddingLength * EmbeddingLength + 13 * EmbeddingLength) + VocabularyLength * EmbeddingLength ret := blockCount*(12*embeddingLength*embeddingLength+13*embeddingLength) + vocabularyLength*embeddingLength - // TODO MoE + // TODO MoE / SSM / RoPE. return GGUFParametersScalar(ret) } diff --git a/file_architecture.go b/file_architecture.go index dc94058..8e1dd31 100644 --- a/file_architecture.go +++ b/file_architecture.go @@ -2,13 +2,13 @@ package gguf_parser // GGUFArchitectureMetadata represents the architecture metadata of a GGUF file. type GGUFArchitectureMetadata struct { - // ContextLength(n_ctx_train) is the context length of the model. + // MaximumContextLength(n_ctx_train) is the maximum context length of the model. // // For most architectures, this is the hard limit on the length of the input. // Architectures, like RWKV, // that are not reliant on transformer-style attention may be able to handle larger inputs, // but this is not guaranteed. - ContextLength uint64 `json:"contextLength"` + MaximumContextLength uint64 `json:"maximumContextLength"` // EmbeddingLength(n_embd) is the length of the embedding layer. EmbeddingLength uint64 `json:"embeddingLength"` // BlockCount(n_layer) is the number of blocks of attention and feed-forward layers, @@ -147,7 +147,7 @@ func (gf *GGUFFile) Architecture() (ga GGUFArchitectureMetadata) { }) if v, ok := m[contextLengthKey]; ok { - ga.ContextLength = ValueNumeric[uint64](v) + ga.MaximumContextLength = ValueNumeric[uint64](v) } if v, ok := m[embeddingLengthKey]; ok { ga.EmbeddingLength = ValueNumeric[uint64](v) diff --git a/file_estimate.go b/file_estimate.go index 64bd663..e3398a6 100644 --- a/file_estimate.go +++ b/file_estimate.go @@ -49,7 +49,7 @@ func (gf *GGUFFile) estimateKVCache(a GGUFArchitectureMetadata, o _GGUFEstimateO var ( embedKeyGQA = uint64(a.AttentionKeyLength) * a.AttentionHeadCountKV embedValGQA = uint64(a.AttentionValueLength) * a.AttentionHeadCountKV - kvSize = a.ContextLength + kvSize = a.MaximumContextLength ) { // Correct. diff --git a/file_model.go b/file_model.go index bed34c5..42da58b 100644 --- a/file_model.go +++ b/file_model.go @@ -18,7 +18,6 @@ type GGUFModelMetadata struct { // Not required if the model is not quantized (i.e. no tensors are quantized). // If any tensors are quantized, this must be present. // This is separate to the quantization scheme of the tensors itself, - // // the quantization version may change without changing the scheme's name, // e.g. the quantization scheme is Q5_K, and the QuantizationVersion is 4. QuantizationVersion uint32 `json:"quantizationVersion,omitempty"`