From b3d65a46dc92525325e17c248d6f315cbc793408 Mon Sep 17 00:00:00 2001 From: thxCode Date: Fri, 7 Jun 2024 19:08:17 +0800 Subject: [PATCH] refactor: simplify estimate Signed-off-by: thxCode --- README.md | 6 -- cmd/gguf-parser/main.go | 54 ++++------ file.go | 162 +++++++----------------------- file_architecture.go | 6 ++ file_architecture_test.go | 2 +- file_estimate.go | 143 +++++++++++++-------------- file_estimate_option.go | 33 ++++--- file_estimate_test.go | 35 +------ file_model_test.go | 2 +- file_tokenizer_test.go | 2 +- ggml.go | 203 ++++++++++++++++++++++++++++++++++++++ 11 files changed, 363 insertions(+), 285 deletions(-) create mode 100644 ggml.go diff --git a/README.md b/README.md index 4182db1..84013d1 100644 --- a/README.md +++ b/README.md @@ -121,12 +121,6 @@ spew.Dump(f.Estimate(WithContextSize(4096) /* 4K */)) ``` -#### Estimate with specific offload layers - -```go -spew.Dump(f.Estimate(WithOffloadLayers(10))) -``` - ## License MIT diff --git a/cmd/gguf-parser/main.go b/cmd/gguf-parser/main.go index 3afa85d..9373f4f 100644 --- a/cmd/gguf-parser/main.go +++ b/cmd/gguf-parser/main.go @@ -32,9 +32,8 @@ func main() { skipProxy bool skipTLS bool // estimate options - ctxSize = 512 - kvType = "f16" - offloadLayers uint64 + ctxSize = 512 + kvType = "f16" // output options version bool skipModel bool @@ -65,7 +64,6 @@ func main() { fs.BoolVar(&skipTLS, "skip-tls", skipTLS, "Skip TLS verification when reading from a remote URL") fs.IntVar(&ctxSize, "ctx-size", ctxSize, "Context size to estimate memory usage") fs.StringVar(&kvType, "kv-type", kvType, "Key-Value cache type, select from [f32, f16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1]") - fs.Uint64Var(&offloadLayers, "offload-layers", offloadLayers, "Specify how many layers to offload, default is fully offloading") fs.BoolVar(&version, "version", version, "Show version") fs.BoolVar(&skipModel, "skip-model", skipModel, "Skip model metadata") fs.BoolVar(&skipArchitecture, "skip-architecture", skipArchitecture, "Skip architecture metadata") @@ -129,9 +127,6 @@ func main() { } eopts = append(eopts, WithCacheKeyType(kv), WithCacheValueType(kv)) } - if offloadLayers > 0 { - eopts = append(eopts, WithOffloadLayers(offloadLayers)) - } // Parse GGUF file. 
@@ -206,23 +201,23 @@ func main() { if !skipModel { tprintf( "MODEL", - []string{"Name", "Architecture", "Quantization Version", "File Type", "Little Endian", "Size", "Parameters", "BPW"}, + []string{"Name", "Arch", "Quantization Version", "File Type", "Little Endian", "Size", "Parameters", "BPW"}, []string{ m.Name, m.Architecture, sprintf(m.QuantizationVersion), sprintf(m.FileType), sprintf(m.LittleEndian), - m.Size.String(), - m.Parameters.String(), - m.BitsPerWeight.String(), + sprintf(m.Size), + sprintf(m.Parameters), + sprintf(m.BitsPerWeight), }) } if !skipArchitecture { tprintf( "ARCHITECTURE", - []string{"Max Context Length", "Embedding Length", "Layers", "Feed Forward Length", "Expert Count", "Vocabulary Length"}, + []string{"Max Context Len", "Embedding Len", "Layers", "Feed Forward Len", "Expert Cnt", "Vocabulary Len"}, []string{ sprintf(a.MaximumContextLength), sprintf(a.EmbeddingLength), @@ -242,7 +237,7 @@ func main() { } tprintf( "TOKENIZER", - []string{"Model", "Tokens Length", "Added Tokens Length", "BOS Token", "EOS Token", "Unknown Token", "Separator Token", "Padding Token"}, + []string{"Model", "Tokens Len", "Added Tokens Len", "BOS Token", "EOS Token", "Unknown Token", "Separator Token", "Padding Token"}, []string{ t.Model, sprintf(t.TokensLength), @@ -256,30 +251,17 @@ func main() { } if !skipEstimate { - bs := [][]string{ - { - "TOTAL", - sprintf(ctxSize), - e.Total.KVCache.Sum().String(), - e.Total.Compute.String(), - e.Total.IO.String(), - e.Total.Sum().String(), - }, - } - if e.Offload != nil { - bs = append(bs, []string{ - "OFFLOAD", - sprintf(ctxSize), - e.Offload.KVCache.Sum().String(), - e.Offload.Compute.String(), - e.Offload.IO.String(), - e.Offload.Sum().String(), - }) - } tprintf( "ESTIMATE", - []string{"/", "Context Length", "KV Cache", "Compute Memory", "IO Memory", "Sum"}, - bs...) + []string{"Context Size", "Model Weight", "KV Cache", "Computation Graph Overhead", "Others", "Usage (w/o MMap)"}, + []string{ + sprintf(ctxSize), + sprintf(e.ModelWeight), + sprintf(e.KVCache.Sum()), + sprintf(e.ComputationGraphOverhead), + sprintf(e.Others), + sprintf(e.Sum()) + " (" + sprintf(e.Sum()+e.ModelWeight) + ")", + }) } } @@ -323,7 +305,7 @@ func tprintf(title string, header []string, body ...[]string) { tb.SetAlignment(tablewriter.ALIGN_CENTER) tb.SetHeaderLine(true) tb.SetRowLine(true) - tb.SetAutoMergeCells(true) + tb.SetAutoMergeCellsByColumnIndex([]int{0, 1, 2, 3}) tb.Append(append([]string{title}, header...)) for i := range body { tb.Append(append([]string{title}, body[i]...)) diff --git a/file.go b/file.go index c9a6ffb..1e3622d 100644 --- a/file.go +++ b/file.go @@ -165,59 +165,6 @@ type ( GGUFMetadataKVs []GGUFMetadataKV ) -// Types for GGMLType. -type ( - // GGMLType is a type of GGML tensor, - // see https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#file-structure. - GGMLType uint32 - - // GGMLTypeTrait holds the trait of a GGMLType, - // see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/src/ggml.c#L564-L918. - GGMLTypeTrait struct { - BlockSize uint64 // Original is int, in order to reduce conversion, here we use uint64. - TypeSize uint64 // Original is uint32, in order to reduce conversion, here we use uint64. - Quantized bool - } -) - -// GGMLType constants. -// -// GGMLTypeQ4_2, GGMLTypeQ4_3 are deprecated. 
-const ( - GGMLTypeF32 GGMLType = iota - GGMLTypeF16 - GGMLTypeQ4_0 - GGMLTypeQ4_1 - GGMLTypeQ4_2 - GGMLTypeQ4_3 - GGMLTypeQ5_0 - GGMLTypeQ5_1 - GGMLTypeQ8_0 - GGMLTypeQ8_1 - GGMLTypeQ2_K - GGMLTypeQ3_K - GGMLTypeQ4_K - GGMLTypeQ5_K - GGMLTypeQ6_K - GGMLTypeQ8_K - GGMLTypeIQ2_XXS - GGMLTypeIQ2_XS - GGMLTypeIQ3_XXS - GGMLTypeIQ1_S - GGMLTypeIQ4_NL - GGMLTypeIQ3_S - GGMLTypeIQ2_S - GGMLTypeIQ4_XS - GGMLTypeI8 - GGMLTypeI16 - GGMLTypeI32 - GGMLTypeI64 - GGMLTypeF64 - GGMLTypeIQ1_M - GGMLTypeBF16 - _GGMLTypeCount // Unknown -) - // Types for GGUFTensorInfo. type ( // GGUFTensorInfo represents a tensor info in a GGUF file. @@ -458,7 +405,8 @@ func parseGGUFFile(s int64, f io.ReadSeeker, o _GGUFReadOptions) (_ *GGUFFile, e // Types for GGUF hierarchical tensors. type ( - // IGGUFTensorInfos is an interface for GGUFTensorInfos. + // IGGUFTensorInfos is an interface for GGUF tensor infos, + // which includes basic operations. IGGUFTensorInfos interface { // Get returns the GGUFTensorInfo with the given name, // and true if found, and false otherwise. @@ -468,10 +416,12 @@ type ( // Index returns a map value to the GGUFTensorInfo with the given names, // and the number of names found. Index(names []string) (infos map[string]GGUFTensorInfo, found int) - // Elements returns the number of elements of the GGUFTensorInfo. + // Elements returns the number of elements(parameters). Elements() uint64 - // Bytes returns the number of bytes of the GGUFTensorInfo. + // Bytes returns the number of bytes. Bytes() uint64 + // Count returns the number of tensors. + Count() uint64 } // GGUFLayerTensorInfos represents hierarchical tensor infos of a GGUF file, @@ -496,7 +446,16 @@ type ( ) // Layers converts the GGUFTensorInfos to GGUFLayerTensorInfos. -func (gf *GGUFFile) Layers() GGUFLayerTensorInfos { +func (gf *GGUFFile) Layers(ignores ...string) GGUFLayerTensorInfos { + ls := gf.layers() + if len(ignores) != 0 { + _, ls, _ = ls.Cut(ignores) + return ls + } + return ls +} + +func (gf *GGUFFile) layers() GGUFLayerTensorInfos { var ret GGUFLayerTensorInfos pm := make(map[string]any) @@ -921,73 +880,6 @@ func (kvs GGUFMetadataKVs) Index(keys []string) (values map[string]GGUFMetadataK return values, found } -// _GGMLTypeTraits is a table of GGMLTypeTrait for GGMLType. 
-var _GGMLTypeTraits = map[GGMLType]GGMLTypeTrait{ - GGMLTypeF32: {BlockSize: 1, TypeSize: 4}, - GGMLTypeF16: {BlockSize: 1, TypeSize: 2}, - GGMLTypeQ4_0: {BlockSize: 32, TypeSize: 18, Quantized: true}, - GGMLTypeQ4_1: {BlockSize: 32, TypeSize: 20, Quantized: true}, - GGMLTypeQ4_2: {BlockSize: 0, TypeSize: 0}, // Deprecated - GGMLTypeQ4_3: {BlockSize: 0, TypeSize: 0}, // Deprecated - GGMLTypeQ5_0: {BlockSize: 32, TypeSize: 22, Quantized: true}, - GGMLTypeQ5_1: {BlockSize: 32, TypeSize: 24, Quantized: true}, - GGMLTypeQ8_0: {BlockSize: 32, TypeSize: 34, Quantized: true}, - GGMLTypeQ8_1: {BlockSize: 32, TypeSize: 36, Quantized: true}, - GGMLTypeQ2_K: {BlockSize: 256, TypeSize: 84, Quantized: true}, - GGMLTypeQ3_K: {BlockSize: 256, TypeSize: 110, Quantized: true}, - GGMLTypeQ4_K: {BlockSize: 256, TypeSize: 144, Quantized: true}, - GGMLTypeQ5_K: {BlockSize: 256, TypeSize: 176, Quantized: true}, - GGMLTypeQ6_K: {BlockSize: 256, TypeSize: 210, Quantized: true}, - GGMLTypeQ8_K: {BlockSize: 256, TypeSize: 292, Quantized: true}, - GGMLTypeIQ2_XXS: {BlockSize: 256, TypeSize: 66, Quantized: true}, - GGMLTypeIQ2_XS: {BlockSize: 256, TypeSize: 74, Quantized: true}, - GGMLTypeIQ3_XXS: {BlockSize: 256, TypeSize: 98, Quantized: true}, - GGMLTypeIQ1_S: {BlockSize: 256, TypeSize: 50, Quantized: true}, - GGMLTypeIQ4_NL: {BlockSize: 32, TypeSize: 18, Quantized: true}, - GGMLTypeIQ3_S: {BlockSize: 256, TypeSize: 110, Quantized: true}, - GGMLTypeIQ2_S: {BlockSize: 256, TypeSize: 82, Quantized: true}, - GGMLTypeIQ4_XS: {BlockSize: 256, TypeSize: 136, Quantized: true}, - GGMLTypeI8: {BlockSize: 1, TypeSize: 1}, - GGMLTypeI16: {BlockSize: 1, TypeSize: 2}, - GGMLTypeI32: {BlockSize: 1, TypeSize: 4}, - GGMLTypeI64: {BlockSize: 1, TypeSize: 8}, - GGMLTypeF64: {BlockSize: 1, TypeSize: 8}, - GGMLTypeIQ1_M: {BlockSize: 256, TypeSize: 56, Quantized: true}, - GGMLTypeBF16: {BlockSize: 1, TypeSize: 2}, -} - -// Trait returns the GGMLTypeTrait of the GGMLType. -func (t GGMLType) Trait() (GGMLTypeTrait, bool) { - tt, ok := _GGMLTypeTraits[t] - return tt, ok -} - -// RowSizeOf returns the size of the given dimensions according to the GGMLType's GGMLTypeTrait, -// which is inspired by -// https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/src/ggml.c#L3142-L3145. -// -// The index of the given dimensions means the number of dimension, -// i.e. 0 is the first dimension, 1 is the second dimension, and so on. -// -// The value of the item is the number of elements in the corresponding dimension. -func (t GGMLType) RowSizeOf(dimensions []uint64) uint64 { - if len(dimensions) == 0 { - panic(errors.New("no dimensions")) - } - - tt, ok := t.Trait() - if !ok { - panic(fmt.Errorf("invalid type: %v", t)) - } - - // https://github.com/ggerganov/ggml/blob/a10a8b880c059b3b29356eb9a9f8df72f03cdb6a/src/ggml.c#L2640-L2643 - ds := tt.TypeSize * dimensions[0] / tt.BlockSize // Row size - for i := 1; i < len(dimensions); i++ { - ds *= dimensions[i] - } - return ds -} - // Get returns the GGUFTensorInfo with the given name, // and true if found, and false otherwise. func (ti GGUFTensorInfo) Get(name string) (info GGUFTensorInfo, found bool) { @@ -1071,6 +963,12 @@ func (ti GGUFTensorInfo) Bytes() uint64 { return ret } +// Count returns the number of GGUF tensors of the GGUFTensorInfo, +// which is always 1. +func (ti GGUFTensorInfo) Count() uint64 { + return 1 +} + // Get returns the GGUFTensorInfo with the given name, // and true if found, and false otherwise. 
func (tis GGUFTensorInfos) Get(name string) (info GGUFTensorInfo, found bool) { @@ -1130,7 +1028,12 @@ func (tis GGUFTensorInfos) Bytes() uint64 { return ret } -// Get returns the GGUFTensorInfo with the given name, +// Count returns the number of GGUF tensors of the GGUFTensorInfos. +func (tis GGUFTensorInfos) Count() uint64 { + return uint64(len(tis)) +} + +// Get returns the IGGUFTensorInfos with the given name, // and true if found, and false otherwise. func (ltis GGUFLayerTensorInfos) Get(name string) (info GGUFTensorInfo, found bool) { for i := range ltis { @@ -1211,6 +1114,15 @@ func (ltis GGUFLayerTensorInfos) Bytes() uint64 { return ret } +// Count returns the number of GGUF tensors of the GGUFLayerTensorInfos. +func (ltis GGUFLayerTensorInfos) Count() uint64 { + var ret uint64 + for i := range ltis { + ret += ltis[i].Count() + } + return ret +} + // Cut splits the GGUFLayerTensorInfos into two parts, // and returns the GGUFLayerTensorInfos with the names that match the given names at first, // and the GGUFLayerTensorInfos without the names at second, diff --git a/file_architecture.go b/file_architecture.go index 8e1dd31..09c60b7 100644 --- a/file_architecture.go +++ b/file_architecture.go @@ -2,6 +2,10 @@ package gguf_parser // GGUFArchitectureMetadata represents the architecture metadata of a GGUF file. type GGUFArchitectureMetadata struct { + // Architecture describes what architecture this model implements. + // + // All lowercase ASCII, with only [a-z0-9]+ characters allowed. + Architecture string `json:"architecture"` // MaximumContextLength(n_ctx_train) is the maximum context length of the model. // // For most architectures, this is the hard limit on the length of the input. @@ -114,6 +118,8 @@ func (gf *GGUFFile) Architecture() (ga GGUFArchitectureMetadata) { tokenizerGGMLTokensKey = "tokenizer.ggml.tokens" ) + ga.Architecture = arch + m, _ := gf.Header.MetadataKV.Index([]string{ contextLengthKey, embeddingLengthKey, diff --git a/file_architecture_test.go b/file_architecture_test.go index 84fa433..b6b17a6 100644 --- a/file_architecture_test.go +++ b/file_architecture_test.go @@ -14,7 +14,7 @@ func TestGGUFFile_Architecture(t *testing.T) { f, err := ParseGGUFFileFromHuggingFace( ctx, "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", - "Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf", + "Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf", SkipLargeMetadata()) if err != nil { t.Fatal(err) diff --git a/file_estimate.go b/file_estimate.go index 366aa85..186f02b 100644 --- a/file_estimate.go +++ b/file_estimate.go @@ -1,22 +1,20 @@ package gguf_parser -// GGUFEstimate represents the estimated result of the GGUF file. -type GGUFEstimate struct { - // Offload is the offloaded layers usage. - Offload *GGUFMemoryUsage `json:"offload,omitempty"` - // Total is the total memory usage. - Total GGUFMemoryUsage `json:"total"` -} +import ( + "github.com/thxcode/gguf-parser-go/util/ptr" +) type ( - // GGUFMemoryUsage represents the memory usage of the GGUF file. - GGUFMemoryUsage struct { + // GGUFEstimate represents the estimated result of the GGUF file. + GGUFEstimate struct { + // ModelWeight is the memory usage of model weight. + ModelWeight GGUFBytesScalar `json:"modelWeight"` // KVCache is the usage of key-value cache. KVCache GGUFKVCacheUsage `json:"kvCache"` - // Compute is the usage of transformer layers. - Compute GGUFBytesScalar `json:"compute"` - // IO is the usage of input/output layers. - IO GGUFBytesScalar `json:"io"` + // ComputationGraphOverhead is the overhead of computation graph. 
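+	// It accounts for the graph bookkeeping structures and the input tensors
+	// (token, embedding, position, output ID and KQ mask buffers).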
+	ComputationGraphOverhead GGUFBytesScalar `json:"computationGraphOverhead"`
+	// Others is the trivial usage, e.g. runtime overhead, GGML context and output buffer.
+	Others GGUFBytesScalar `json:"others"`
 	}

 	// GGUFKVCacheUsage represents the usage of kv-cache.
@@ -35,89 +33,90 @@ func (gf *GGUFFile) Estimate(opts ...GGUFEstimateOption) (ge GGUFEstimate) {
 		opt(&o)
 	}

-	ge.Offload, ge.Total = gf.estimateMemoryUsage(gf.Architecture(), o)
-	return ge
-}
-
-func (m GGUFMemoryUsage) Sum() GGUFBytesScalar {
-	return m.Compute + m.KVCache.Sum() + m.IO
-}
+	a := gf.Architecture()

-func (c GGUFKVCacheUsage) Sum() GGUFBytesScalar {
-	return c.Key + c.Value
-}
-
-func (gf *GGUFFile) estimateMemoryUsage(a GGUFArchitectureMetadata, o _GGUFEstimateOptions) (offload *GGUFMemoryUsage, total GGUFMemoryUsage) {
-	if o.OffloadLayers != nil {
-		offload = &GGUFMemoryUsage{}
+	contextSize := a.MaximumContextLength
+	if o.ContextSize != nil {
+		contextSize = uint64(*o.ContextSize)
 	}

-	// KV cache.
-	// https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L2479-L2501
+	// Model weight.
+	ge.ModelWeight = gf.ModelSize
+
+	// KV cache,
+	// see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L2479-L2501.
 	{
 		kt, vt := GGMLTypeF16, GGMLTypeF16
-
+		kvSize := contextSize
 		if o.CacheKeyType != nil {
 			kt = *o.CacheKeyType
 		}
 		if o.CacheValueType != nil {
 			vt = *o.CacheValueType
 		}
+		if a.Architecture == "mamba" {
+			// See https://github.com/ggerganov/llama.cpp/blob/7672adeec7a79ea271058c63106c142ba84f951a/llama.cpp#L16122-L16129.
+			kt, vt = GGMLTypeF32, GGMLTypeF32
+			kvSize = uint64(ptr.Deref(o.ParallelSize, 1))
+		}

 		var (
 			embedKeyGQA = uint64(a.AttentionKeyLength) * a.AttentionHeadCountKV
 			embedValGQA = uint64(a.AttentionValueLength) * a.AttentionHeadCountKV
-			kvSize      = a.MaximumContextLength
 		)
-		{
-			// Correct.
-			if a.SSMConvolutionKernel > 0 {
-				embedKeyGQA += uint64(a.SSMConvolutionKernel - 1*a.SSMInnerSize)
-				embedValGQA += uint64(a.SSMStateSize * a.SSMInnerSize)
-			}
-			if o.ContextSize != nil {
-				kvSize = uint64(*o.ContextSize)
-			}
+		if a.SSMConvolutionKernel > 0 {
+			embedKeyGQA += uint64((a.SSMConvolutionKernel - 1) * a.SSMInnerSize) // (d_conv - 1) * d_inner
+			embedValGQA += uint64(a.SSMStateSize * a.SSMInnerSize)               // d_state * d_inner
 		}

 		krs := kt.RowSizeOf([]uint64{embedKeyGQA * kvSize})
 		vrs := vt.RowSizeOf([]uint64{embedValGQA * kvSize})

-		if offload != nil {
-			v := *o.OffloadLayers
-			if v > a.BlockCount {
-				v = a.BlockCount
-			}
-			offload.KVCache.Key = GGUFBytesScalar(krs * v)
-			offload.KVCache.Value = GGUFBytesScalar(vrs * v)
-		}
+		ge.KVCache.Key = GGUFBytesScalar(krs * a.BlockCount)
+		ge.KVCache.Value = GGUFBytesScalar(vrs * a.BlockCount)
+	}

-		total.KVCache.Key = GGUFBytesScalar(krs * a.BlockCount)
-		total.KVCache.Value = GGUFBytesScalar(vrs * a.BlockCount)
+	// Others.
+	{
+		// Runtime overhead.
+		ge.Others += GGUFBytesScalar(15 * 1024 * 1024) // NB(thxCode): Magic here.
+
+		// GGML context,
+		// see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L5015-L5036.
+		ggmlCtx := 2 /* buffer count */ * GGMLTensorOverhead() * (uint64(len(gf.TensorInfos)) + 1 + a.BlockCount*3)
+		ge.Others += GGUFBytesScalar(ggmlCtx)
+
+		// Output buffer,
+		// see https://github.com/ggerganov/llama.cpp/blob/7672adeec7a79ea271058c63106c142ba84f951a/llama.cpp#L11940-L12003.
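+		// The buffer holds float32 logits (n_vocab) and float32 embedding outputs
+		// (n_embd) for each parallel decoding sequence.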
+ outBuffer := 4 /* float32 size */ * (a.VocabularyLength + a.EmbeddingLength) * uint64(ptr.Deref(o.ParallelSize, 1)) + ge.Others += GGUFBytesScalar(outBuffer) } - ls := gf.Layers() - bls, als, _ := ls.Cut([]string{ - "token_embd.weight", - "output.weight", - "output_norm.weight", - }) - - // IO. - total.IO = GGUFBytesScalar(bls.Bytes()) - - // Compute. - if offload != nil { - v := *o.OffloadLayers - if v >= a.BlockCount { - offload.Compute = GGUFBytesScalar(als.Bytes()) - } else { - for i := uint64(len(als) - 1); i >= uint64(len(als))-v; i-- { - offload.Compute += GGUFBytesScalar(als[i].Bytes()) - } - } + // Computation graph. + { + graphOverhead := GGMLTensorOverhead()*GGMLComputationGraphNodesMaximum + + GGMLComputationGraphOverhead(GGMLComputationGraphNodesMaximum, false) + ge.ComputationGraphOverhead += GGUFBytesScalar(graphOverhead) + + var ( + nBatch = min(contextSize, uint64(ptr.Deref(o.BatchSize, 512))) + + inpTokens = GGMLTypeI32.RowSizeOf([]uint64{nBatch}) // I32 [n_batch] + inpEmbd = GGMLTypeF32.RowSizeOf([]uint64{a.EmbeddingLength, nBatch}) // F32 [n_embd, n_batch] + inpPos = GGMLTypeI32.RowSizeOf([]uint64{contextSize}) // I32 [n_tokens] + inpOutIds = GGMLTypeI32.RowSizeOf([]uint64{contextSize}) // I32 [n_output], + inpKQMask = GGMLTypeF32.RowSizeOf([]uint64{contextSize, nBatch}) // F32 [n_kv, n_batch] + ) + ge.ComputationGraphOverhead += GGUFBytesScalar(inpTokens + inpEmbd + inpPos + inpKQMask + inpOutIds) } - total.Compute = GGUFBytesScalar(als.Bytes()) - return offload, total + return ge +} + +func (e GGUFEstimate) Sum() GGUFBytesScalar { + return e.KVCache.Sum() + e.ComputationGraphOverhead + e.Others +} + +func (c GGUFKVCacheUsage) Sum() GGUFBytesScalar { + return c.Key + c.Value } diff --git a/file_estimate_option.go b/file_estimate_option.go index d63ef41..80f2b10 100644 --- a/file_estimate_option.go +++ b/file_estimate_option.go @@ -7,9 +7,10 @@ import ( type ( _GGUFEstimateOptions struct { ContextSize *int32 + ParallelSize *int32 + BatchSize *int32 CacheKeyType *GGMLType CacheValueType *GGMLType - OffloadLayers *uint64 } GGUFEstimateOption func(*_GGUFEstimateOptions) ) @@ -24,6 +25,26 @@ func WithContextSize(size int32) GGUFEstimateOption { } } +// WithParallelSize sets the (decoding sequences) parallel size for the estimate. +func WithParallelSize(size int32) GGUFEstimateOption { + return func(o *_GGUFEstimateOptions) { + if size <= 0 { + return + } + o.ParallelSize = &size + } +} + +// WithBatchSize sets the physical batch size for the estimate. +func WithBatchSize(size int32) GGUFEstimateOption { + return func(o *_GGUFEstimateOptions) { + if size <= 0 { + return + } + o.BatchSize = &size + } +} + // _GGUFEstimateCacheTypeAllowList is the allow list of cache key and value types. var _GGUFEstimateCacheTypeAllowList = []GGMLType{ GGMLTypeF32, @@ -51,13 +72,3 @@ func WithCacheValueType(t GGMLType) GGUFEstimateOption { } } } - -// WithOffloadLayers sets the number of layers to offload. 
-func WithOffloadLayers(layers uint64) GGUFEstimateOption { - return func(o *_GGUFEstimateOptions) { - if layers <= 0 { - return - } - o.OffloadLayers = &layers - } -} diff --git a/file_estimate_test.go b/file_estimate_test.go index d922174..5f7380f 100644 --- a/file_estimate_test.go +++ b/file_estimate_test.go @@ -20,7 +20,7 @@ func TestGGUFFile_Estimate(t *testing.T) { f, err := ParseGGUFFileFromHuggingFace( ctx, "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", - "Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf", + "Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf", SkipLargeMetadata()) if err != nil { t.Fatal(err) @@ -65,13 +65,13 @@ func TestGGUFFile_Estimate(t *testing.T) { } } -func TestGGUFFile_Estimate_KVCache(t *testing.T) { +func TestGGUFFile_Estimate_ContextSize(t *testing.T) { ctx := context.Background() f, err := ParseGGUFFileFromHuggingFace( ctx, "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", - "Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf", + "Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf", SkipLargeMetadata()) if err != nil { t.Fatal(err) @@ -93,32 +93,3 @@ func TestGGUFFile_Estimate_KVCache(t *testing.T) { }) } } - -func TestGGUFFile_Estimate_Offload(t *testing.T) { - ctx := context.Background() - - f, err := ParseGGUFFileFromHuggingFace( - ctx, - "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", - "Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf", - SkipLargeMetadata()) - if err != nil { - t.Fatal(err) - return - } - - cases := []struct { - name string - opts []GGUFEstimateOption - }{ - {"offload 0 layer", []GGUFEstimateOption{WithContextSize(512), WithOffloadLayers(0)}}, - {"offload 1 layer", []GGUFEstimateOption{WithContextSize(512), WithOffloadLayers(1)}}, - {"offload 10 layers", []GGUFEstimateOption{WithContextSize(512), WithOffloadLayers(10)}}, - {"offload 33 layers", []GGUFEstimateOption{WithContextSize(512), WithOffloadLayers(33)}}, // exceeds the number of layers - } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - t.Log("\n", spew.Sdump(f.Estimate(tc.opts...)), "\n") - }) - } -} diff --git a/file_model_test.go b/file_model_test.go index c52019a..5400944 100644 --- a/file_model_test.go +++ b/file_model_test.go @@ -16,7 +16,7 @@ func TestGGUFFile_Model(t *testing.T) { f, err := ParseGGUFFileFromHuggingFace( ctx, "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", - "Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf", + "Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf", SkipLargeMetadata()) if err != nil { t.Fatal(err) diff --git a/file_tokenizer_test.go b/file_tokenizer_test.go index 39cdf44..26ba749 100644 --- a/file_tokenizer_test.go +++ b/file_tokenizer_test.go @@ -14,7 +14,7 @@ func TestGGUFFile_Tokenizer(t *testing.T) { f, err := ParseGGUFFileFromHuggingFace( ctx, "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", - "Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf", + "Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf", SkipLargeMetadata()) if err != nil { t.Fatal(err) diff --git a/ggml.go b/ggml.go new file mode 100644 index 0000000..d438589 --- /dev/null +++ b/ggml.go @@ -0,0 +1,203 @@ +package gguf_parser + +import ( + "errors" + "fmt" + "slices" +) + +// Types for GGMLType. +type ( + // GGMLType is a type of GGML tensor, + // see https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#file-structure. + GGMLType uint32 + + // GGMLTypeTrait holds the trait of a GGMLType, + // see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/src/ggml.c#L564-L918. + GGMLTypeTrait struct { + BlockSize uint64 // Original is int, in order to reduce conversion, here we use uint64. 
+ TypeSize uint64 // Original is uint32, in order to reduce conversion, here we use uint64. + Quantized bool + } +) + +// GGMLType constants. +// +// GGMLTypeQ4_2, GGMLTypeQ4_3 are deprecated. +const ( + GGMLTypeF32 GGMLType = iota + GGMLTypeF16 + GGMLTypeQ4_0 + GGMLTypeQ4_1 + GGMLTypeQ4_2 + GGMLTypeQ4_3 + GGMLTypeQ5_0 + GGMLTypeQ5_1 + GGMLTypeQ8_0 + GGMLTypeQ8_1 + GGMLTypeQ2_K + GGMLTypeQ3_K + GGMLTypeQ4_K + GGMLTypeQ5_K + GGMLTypeQ6_K + GGMLTypeQ8_K + GGMLTypeIQ2_XXS + GGMLTypeIQ2_XS + GGMLTypeIQ3_XXS + GGMLTypeIQ1_S + GGMLTypeIQ4_NL + GGMLTypeIQ3_S + GGMLTypeIQ2_S + GGMLTypeIQ4_XS + GGMLTypeI8 + GGMLTypeI16 + GGMLTypeI32 + GGMLTypeI64 + GGMLTypeF64 + GGMLTypeIQ1_M + GGMLTypeBF16 + _GGMLTypeCount // Unknown +) + +// _GGMLTypeTraits is a table of GGMLTypeTrait for GGMLType. +var _GGMLTypeTraits = map[GGMLType]GGMLTypeTrait{ + GGMLTypeF32: {BlockSize: 1, TypeSize: 4}, + GGMLTypeF16: {BlockSize: 1, TypeSize: 2}, + GGMLTypeQ4_0: {BlockSize: 32, TypeSize: 18, Quantized: true}, + GGMLTypeQ4_1: {BlockSize: 32, TypeSize: 20, Quantized: true}, + GGMLTypeQ4_2: {BlockSize: 0, TypeSize: 0}, // Deprecated + GGMLTypeQ4_3: {BlockSize: 0, TypeSize: 0}, // Deprecated + GGMLTypeQ5_0: {BlockSize: 32, TypeSize: 22, Quantized: true}, + GGMLTypeQ5_1: {BlockSize: 32, TypeSize: 24, Quantized: true}, + GGMLTypeQ8_0: {BlockSize: 32, TypeSize: 34, Quantized: true}, + GGMLTypeQ8_1: {BlockSize: 32, TypeSize: 36, Quantized: true}, + GGMLTypeQ2_K: {BlockSize: 256, TypeSize: 84, Quantized: true}, + GGMLTypeQ3_K: {BlockSize: 256, TypeSize: 110, Quantized: true}, + GGMLTypeQ4_K: {BlockSize: 256, TypeSize: 144, Quantized: true}, + GGMLTypeQ5_K: {BlockSize: 256, TypeSize: 176, Quantized: true}, + GGMLTypeQ6_K: {BlockSize: 256, TypeSize: 210, Quantized: true}, + GGMLTypeQ8_K: {BlockSize: 256, TypeSize: 292, Quantized: true}, + GGMLTypeIQ2_XXS: {BlockSize: 256, TypeSize: 66, Quantized: true}, + GGMLTypeIQ2_XS: {BlockSize: 256, TypeSize: 74, Quantized: true}, + GGMLTypeIQ3_XXS: {BlockSize: 256, TypeSize: 98, Quantized: true}, + GGMLTypeIQ1_S: {BlockSize: 256, TypeSize: 50, Quantized: true}, + GGMLTypeIQ4_NL: {BlockSize: 32, TypeSize: 18, Quantized: true}, + GGMLTypeIQ3_S: {BlockSize: 256, TypeSize: 110, Quantized: true}, + GGMLTypeIQ2_S: {BlockSize: 256, TypeSize: 82, Quantized: true}, + GGMLTypeIQ4_XS: {BlockSize: 256, TypeSize: 136, Quantized: true}, + GGMLTypeI8: {BlockSize: 1, TypeSize: 1}, + GGMLTypeI16: {BlockSize: 1, TypeSize: 2}, + GGMLTypeI32: {BlockSize: 1, TypeSize: 4}, + GGMLTypeI64: {BlockSize: 1, TypeSize: 8}, + GGMLTypeF64: {BlockSize: 1, TypeSize: 8}, + GGMLTypeIQ1_M: {BlockSize: 256, TypeSize: 56, Quantized: true}, + GGMLTypeBF16: {BlockSize: 1, TypeSize: 2}, +} + +// Trait returns the GGMLTypeTrait of the GGMLType. +func (t GGMLType) Trait() (GGMLTypeTrait, bool) { + tt, ok := _GGMLTypeTraits[t] + return tt, ok +} + +// RowSizeOf returns the size of the given dimensions according to the GGMLType's GGMLTypeTrait, +// which is inspired by +// https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/src/ggml.c#L3142-L3145. +// +// The index of the given dimensions means the number of dimension, +// i.e. 0 is the first dimension, 1 is the second dimension, and so on. +// +// The value of the item is the number of elements in the corresponding dimension. 
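+//
+// For example, GGMLTypeQ4_K has BlockSize 256 and TypeSize 144, so a [4096, 4096]
+// tensor occupies 144*4096/256 = 2304 bytes per row, and 2304*4096 bytes in total.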
+func (t GGMLType) RowSizeOf(dimensions []uint64) uint64 { + if len(dimensions) == 0 { + panic(errors.New("no dimensions")) + } + + tt, ok := t.Trait() + if !ok { + panic(fmt.Errorf("invalid type: %v", t)) + } + + // https://github.com/ggerganov/ggml/blob/a10a8b880c059b3b29356eb9a9f8df72f03cdb6a/src/ggml.c#L2640-L2643 + ds := tt.TypeSize * dimensions[0] / tt.BlockSize // Row size + for i := 1; i < len(dimensions); i++ { + ds *= dimensions[i] + } + return ds +} + +// GGMLMemoryPadding returns the padded size of the given size according to GGML memory padding, +// see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/include/ggml/ggml.h#L255. +func GGMLMemoryPadding(size uint64) uint64 { + // https://github.com/ggerganov/ggml/blob/0cbb7c0/include/ggml/ggml.h#L238-L243 + const align = 16 + return (size + align - 1) &^ (align - 1) +} + +// GGML tensor constants. +const ( + // GGMLTensorSize is the size of GGML tensor in bytes, + // see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/include/ggml/ggml.h#L606. + GGMLTensorSize = 368 + + // GGMLObjectSize is the size of GGML object in bytes, + // see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/include/ggml/ggml.h#L563. + GGMLObjectSize = 32 +) + +// GGMLTensorOverhead is the overhead of GGML tensor in bytes, +// see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/src/ggml.c#L2765-L2767. +func GGMLTensorOverhead() uint64 { + return GGMLObjectSize + GGMLTensorSize +} + +// GGML computation graph constants. +const ( + // GGMLComputationGraphSize is the size of GGML computation graph in bytes. + GGMLComputationGraphSize = 80 + + // GGMLComputationGraphNodesMaximum is the maximum nodes of the computation graph, + // see https://github.com/ggerganov/llama.cpp/blob/7672adeec7a79ea271058c63106c142ba84f951a/llama.cpp#L103. + GGMLComputationGraphNodesMaximum = 8192 + + // GGMLComputationGraphNodesDefault is the default nodes of the computation graph, + // see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/include/ggml/ggml.h#L237. + GGMLComputationGraphNodesDefault = 2048 +) + +// GGMLComputationGraphOverhead is the overhead of GGML graph in bytes, +// see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/src/ggml.c#L18905-L18917. +func GGMLComputationGraphOverhead(nodes uint64, grads bool) uint64 { + const pointerSize = 8 + + var g uint64 = GGMLComputationGraphSize + g += pointerSize * nodes * 2 + if grads { + g += pointerSize * nodes + } + g += pointerSize * GGMLHashSize(nodes) + + return GGMLObjectSize + GGMLMemoryPadding(g) +} + +// GGMLHashSize returns the size of the hash table for the given base, +// see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/src/ggml.c#L17698-L17722. +func GGMLHashSize(base uint64) uint64 { + primes := []uint64{ + 2, 3, 5, 11, 17, 37, 67, 131, 257, 521, 1031, + 2053, 4099, 8209, 16411, 32771, 65537, 131101, + 262147, 524309, 1048583, 2097169, 4194319, 8388617, + 16777259, 33554467, 67108879, 134217757, 268435459, + 536870923, 1073741827, 2147483659, + } + i, ok := slices.BinarySearchFunc(primes, base, func(e, t uint64) int { + if t >= e { + return 0 + } + return -1 + }) + if !ok { + return base | 1 + } + return primes[i] +}
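A minimal sketch of the revised estimate flow follows; it reuses the Hugging Face fixture from the updated tests, and the option values are illustrative only:

```go
package main

import (
	"context"
	"fmt"

	. "github.com/thxcode/gguf-parser-go"
)

func main() {
	f, err := ParseGGUFFileFromHuggingFace(
		context.Background(),
		"NousResearch/Hermes-2-Pro-Mistral-7B-GGUF",
		"Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf",
		SkipLargeMetadata())
	if err != nil {
		panic(err)
	}

	e := f.Estimate(
		WithContextSize(4096), // 4K context
		WithParallelSize(2),   // two decoding sequences
		WithBatchSize(512))    // physical batch size

	// Sum() covers the KV cache, computation graph overhead and others;
	// add ModelWeight to get the w/o-mmap figure printed by cmd/gguf-parser.
	fmt.Println("kv cache:", e.KVCache.Sum())
	fmt.Println("usage:", e.Sum(), "w/o mmap:", e.Sum()+e.ModelWeight)
}
```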