diff --git a/cmd/gguf-parser/main.go b/cmd/gguf-parser/main.go index 9373f4f..577a57c 100644 --- a/cmd/gguf-parser/main.go +++ b/cmd/gguf-parser/main.go @@ -28,12 +28,15 @@ func main() { repo, model string // read options debug bool - mmap = true skipProxy bool skipTLS bool // estimate options - ctxSize = 512 - kvType = "f16" + ctxSize = -1 + kvType = "f16" + offloadLayers = -1 + batchSize = 512 + parallel = 1 + noMMap bool // output options version bool skipModel bool @@ -59,11 +62,14 @@ func main() { fs.StringVar(&model, "model", model, "Model below the --repo, e.g. "+ "Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf") fs.BoolVar(&debug, "debug", debug, "Debug mode") - fs.BoolVar(&mmap, "mmap", mmap, "Use mmap to read the local file") fs.BoolVar(&skipProxy, "skip-proxy", skipProxy, "Skip using proxy when reading from a remote URL") fs.BoolVar(&skipTLS, "skip-tls", skipTLS, "Skip TLS verification when reading from a remote URL") - fs.IntVar(&ctxSize, "ctx-size", ctxSize, "Context size to estimate memory usage") + fs.IntVar(&ctxSize, "ctx-size", ctxSize, "Context size to estimate memory usage, default is equal to the model's maximum context size") fs.StringVar(&kvType, "kv-type", kvType, "Key-Value cache type, select from [f32, f16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1]") + fs.IntVar(&offloadLayers, "offload-layers", offloadLayers, "Specify how many layers to offload, default is fully offloading") + fs.IntVar(&batchSize, "batch-size", batchSize, "Physical maximum batch size") + fs.IntVar(¶llel, "parallel", parallel, "Number of parallel sequences to decode") + fs.BoolVar(&noMMap, "no-mmap", noMMap, "Do not use memory-mapping, which influences the estimate result") fs.BoolVar(&version, "version", version, "Show version") fs.BoolVar(&skipModel, "skip-model", skipModel, "Skip model metadata") fs.BoolVar(&skipArchitecture, "skip-architecture", skipArchitecture, "Skip architecture metadata") @@ -85,13 +91,11 @@ func main() { ropts := []GGUFReadOption{ SkipLargeMetadata(), + UseMMap(), } if debug { ropts = append(ropts, UseDebug()) } - if mmap { - ropts = append(ropts, UseMMap()) - } if skipProxy { ropts = append(ropts, SkipProxy()) } @@ -99,11 +103,12 @@ func main() { ropts = append(ropts, SkipTLSVerification()) } - if ctxSize <= 0 { - ctxSize = 512 - } eopts := []GGUFEstimateOption{ - WithContextSize(int32(ctxSize)), + WithCacheValueType(GGMLTypeF16), + WithCacheKeyType(GGMLTypeF16), + } + if ctxSize > 0 { + eopts = append(eopts, WithContextSize(int32(ctxSize))) } if kvType != "" { kv := GGMLTypeF16 @@ -127,6 +132,15 @@ func main() { } eopts = append(eopts, WithCacheKeyType(kv), WithCacheValueType(kv)) } + if offloadLayers >= 0 { + eopts = append(eopts, WithOffloadLayers(uint64(offloadLayers))) + } + if batchSize > 0 { + eopts = append(eopts, WithBatchSize(int32(batchSize))) + } + if parallel > 0 { + eopts = append(eopts, WithParallelSize(int32(parallel))) + } // Parse GGUF file. 
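
Editor's aside (not part of the patch): a minimal sketch of how the new flags map onto estimate options when calling the library directly. The module path github.com/thxcode/gguf-parser-go and the exported helpers are taken from this patch; the Hugging Face repository and file names are only illustrative.

package main

import (
	"context"
	"fmt"

	. "github.com/thxcode/gguf-parser-go"
)

func main() {
	// CLI-style inputs: -1 would mean "use the model's maximum context size"
	// and "offload every layer"; here 10 layers are offloaded explicitly.
	ctxSize, offloadLayers, batchSize, parallel := -1, 10, 512, 1

	eopts := []GGUFEstimateOption{
		WithCacheKeyType(GGMLTypeF16),
		WithCacheValueType(GGMLTypeF16),
	}
	if ctxSize > 0 {
		eopts = append(eopts, WithContextSize(int32(ctxSize)))
	}
	if offloadLayers >= 0 {
		eopts = append(eopts, WithOffloadLayers(uint64(offloadLayers)))
	}
	if batchSize > 0 {
		eopts = append(eopts, WithBatchSize(int32(batchSize)))
	}
	if parallel > 0 {
		eopts = append(eopts, WithParallelSize(int32(parallel)))
	}

	f, err := ParseGGUFFileFromHuggingFace(context.Background(),
		"NousResearch/Hermes-2-Pro-Mistral-7B-GGUF",
		"Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf",
		SkipLargeMetadata())
	if err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", f.Estimate(eopts...).Sum(true /* mmap */))
}
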
@@ -183,7 +197,8 @@ func main() { o["tokenizer"] = t } if !skipEstimate { - o["estimate"] = e + es := e.Sum(!noMMap) + o["estimate"] = es } enc := stdjson.NewEncoder(os.Stdout) @@ -237,9 +252,10 @@ func main() { } tprintf( "TOKENIZER", - []string{"Model", "Tokens Len", "Added Tokens Len", "BOS Token", "EOS Token", "Unknown Token", "Separator Token", "Padding Token"}, + []string{"Model", "Tokens Size", "Tokens Len", "Added Tokens Len", "BOS Token", "EOS Token", "Unknown Token", "Separator Token", "Padding Token"}, []string{ t.Model, + sprintf(GGUFBytesScalar(t.TokensSize)), sprintf(t.TokensLength), sprintf(t.AddedTokensLength), sprintTokenID(t.BOSTokenID), @@ -251,16 +267,29 @@ func main() { } if !skipEstimate { + es := e.Sum(!noMMap) + if ctxSize <= 0 { + if a.MaximumContextLength == 0 { + a = gf.Architecture() + } + ctxSize = int(a.MaximumContextLength) + } tprintf( "ESTIMATE", - []string{"Context Size", "Model Weight", "KV Cache", "Computation Graph Overhead", "Others", "Usage (w/o MMap)"}, + []string{"Mem. Arch", "MMap", "Context Size", "(CPU) RAM", "(GPU) VRAM"}, + []string{ + "UMA", + sprintf(!noMMap), + sprintf(ctxSize), + sprintf(es.UMA.RAM), + sprintf(es.UMA.VRAM), + }, []string{ + "NonUMA", + sprintf(!noMMap), sprintf(ctxSize), - sprintf(e.ModelWeight), - sprintf(e.KVCache.Sum()), - sprintf(e.ComputationGraphOverhead), - sprintf(e.Others), - sprintf(e.Sum()) + " (" + sprintf(e.Sum()+e.ModelWeight) + ")", + sprintf(es.NonUMA.RAM), + sprintf(es.NonUMA.VRAM), }) } } diff --git a/file.go b/file.go index 1e3622d..612b5c0 100644 --- a/file.go +++ b/file.go @@ -159,6 +159,9 @@ type ( // // The offset is the start of the file. StartOffset int64 `json:"startOffset"` + + // Size is the size of the array in bytes. + Size int64 `json:"size"` } // GGUFMetadataKVs is a list of GGUFMetadataKV. @@ -1286,6 +1289,11 @@ func (rd _GGUFReader) ReadArray() (v GGUFMetadataKVArrayValue, err error) { return v, fmt.Errorf("read array length: %w", err) } + itemStart, err := rd.f.Seek(0, io.SeekCurrent) + if err != nil { + return v, fmt.Errorf("seek array item start: %w", err) + } + if !rd.o.SkipLargeMetadata { v.Array = make([]any, v.Len) for i := uint64(0); i < v.Len; i++ { @@ -1295,6 +1303,12 @@ func (rd _GGUFReader) ReadArray() (v GGUFMetadataKVArrayValue, err error) { } } + itemEnd, err := rd.f.Seek(0, io.SeekCurrent) + if err != nil { + return v, fmt.Errorf("seek array item end: %w", err) + } + v.Size = itemEnd - itemStart + return v, nil } @@ -1321,6 +1335,12 @@ func (rd _GGUFReader) ReadArray() (v GGUFMetadataKVArrayValue, err error) { return v, fmt.Errorf("seek array end: %w", err) } + itemEnd, err := rd.f.Seek(0, io.SeekCurrent) + if err != nil { + return v, fmt.Errorf("seek array item end: %w", err) + } + v.Size = itemEnd - itemStart + return v, nil } diff --git a/file_estimate.go b/file_estimate.go index 186f02b..7a43727 100644 --- a/file_estimate.go +++ b/file_estimate.go @@ -1,20 +1,41 @@ package gguf_parser import ( + "regexp" + "strings" + "github.com/thxcode/gguf-parser-go/util/ptr" ) +// GGUFEstimate represents the estimated result of the GGUF file. +type GGUFEstimate struct { + // Load is the memory usage of the load part. + Load GGUFMemoryUsage `json:"load"` + // Offload is the memory usage of the offload part. + Offload GGUFMemoryUsage `json:"offload"` +} + type ( - // GGUFEstimate represents the estimated result of the GGUF file. - GGUFEstimate struct { - // ModelWeight is the memory usage of model weight.
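
Editor's aside (not part of the patch): the new Size field above is filled by comparing the reader's offset before and after the array decode. The same bookkeeping in isolation, as a sketch with a hypothetical measureConsumed helper:

package main

import (
	"fmt"
	"io"
	"strings"
)

// measureConsumed reports how many bytes fn consumed from r by seeking to the
// current offset before and after the call, mirroring the itemStart/itemEnd
// bookkeeping added around _GGUFReader.ReadArray.
func measureConsumed(r io.ReadSeeker, fn func(io.Reader) error) (int64, error) {
	start, err := r.Seek(0, io.SeekCurrent)
	if err != nil {
		return 0, err
	}
	if err := fn(r); err != nil {
		return 0, err
	}
	end, err := r.Seek(0, io.SeekCurrent)
	if err != nil {
		return 0, err
	}
	return end - start, nil
}

func main() {
	r := strings.NewReader("hello, gguf")
	n, err := measureConsumed(r, func(r io.Reader) error {
		_, err := io.CopyN(io.Discard, r, 5) // pretend to decode 5 bytes
		return err
	})
	fmt.Println(n, err) // 5 <nil>
}
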
- ModelWeight GGUFBytesScalar `json:"modelWeight"` + // GGUFMemoryUsage represents the memory usage of the GGUF file. + GGUFMemoryUsage struct { + // Weight is the memory usage of weight. + Weight GGUFWeightUsage `json:"weight"` // KVCache is the usage of key-value cache. KVCache GGUFKVCacheUsage `json:"kvCache"` - // ComputationGraphOverhead is the overhead of computation graph. - ComputationGraphOverhead GGUFBytesScalar `json:"computationGraphOverhead"` - // Others is the trivial usage. - Others GGUFBytesScalar `json:"others"` + // Tokens is the memory usage of token. + Tokens GGUFBytesScalar `json:"tokens"` + // Compute is the memory usage of computation. + Compute GGUFComputeUsage `json:"compute"` + } + + // GGUFWeightUsage represents the memory usage of model weight. + GGUFWeightUsage struct { + // Compute is the memory usage of computing. + Compute GGUFBytesScalar `json:"compute"` + // Input is the memory usage of input. + Input GGUFBytesScalar `json:"input"` + // Output is the memory usage of output. + Output GGUFBytesScalar `json:"output"` } // GGUFKVCacheUsage represents the usage of kv-cache. @@ -24,6 +45,14 @@ type ( // Value is the memory usage of the cached value. Value GGUFBytesScalar `json:"value"` } + + // GGUFComputeUsage represents the memory usage of computation. + GGUFComputeUsage struct { + // Graph is the memory usage of computation graph. + Graph GGUFBytesScalar `json:"graph"` + // Others is the trivial usage. + Others GGUFBytesScalar `json:"others"` + } ) // Estimate returns the inference usage estimated result of the GGUF file. @@ -33,21 +62,69 @@ func (gf *GGUFFile) Estimate(opts ...GGUFEstimateOption) (ge GGUFEstimate) { opt(&o) } - a := gf.Architecture() + a, t := gf.Architecture(), gf.Tokenizer() - contextSize := a.MaximumContextLength + nContext := a.MaximumContextLength if o.ContextSize != nil { - contextSize = uint64(*o.ContextSize) + nContext = uint64(*o.ContextSize) + } + + var ( + nLoadLayers = a.BlockCount + nOffloadLayers uint64 + nBatch = min(nContext, uint64(ptr.Deref(o.BatchSize, 512))) + nParallel = uint64(ptr.Deref(o.ParallelSize, 1)) + ) + { + if v := o.OffloadLayers; v == nil { + o.OffloadLayers = ptr.To(a.BlockCount) + nOffloadLayers = nLoadLayers + } else if *v > 0 { + nOffloadLayers = *v + if nOffloadLayers > nLoadLayers { + nOffloadLayers = nLoadLayers + } + } + nLoadLayers -= nOffloadLayers } + ls := gf.Layers() + ioLs, tfLs, _ := ls.Cut([]string{ + "token_embd.weight", + "output.weight", + "output_norm.weight", + }) + // Model weight. - ge.ModelWeight = gf.ModelSize + { + // Compute. + for i, offloadStart := uint64(0), uint64(len(tfLs))-nOffloadLayers; i < uint64(len(tfLs)); i++ { + switch { + case i < nLoadLayers: + ge.Load.Weight.Compute += GGUFBytesScalar(tfLs[i].Bytes()) + case i >= offloadStart: + ge.Offload.Weight.Compute += GGUFBytesScalar(tfLs[i].Bytes()) + } + } + + // IO, + // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L4930-L5002. + inpLs, outLs, _ := ioLs.Cut([]string{ + "token_embd.weight", + }) + ge.Load.Weight.Input = GGUFBytesScalar(inpLs.Bytes()) + ge.Load.Weight.Output = GGUFBytesScalar(outLs.Bytes()) + if nOffloadLayers == a.BlockCount { + ge.Offload.Weight.Output = ge.Load.Weight.Output + ge.Load.Weight.Output = 0 + } + } // KV cache, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L2479-L2501. 
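
Editor's aside (not part of the patch): the load/offload split used by Estimate, isolated into a hypothetical splitLayers helper. A nil request means "offload everything"; requests above the block count are clamped.

package main

import "fmt"

// splitLayers returns how many transformer layers stay on the host (load) and
// how many go to the accelerator (offload), following the patch's clamping.
func splitLayers(blockCount uint64, offload *uint64) (nLoad, nOffload uint64) {
	switch {
	case offload == nil:
		nOffload = blockCount
	case *offload > 0:
		nOffload = min(*offload, blockCount)
	}
	return blockCount - nOffload, nOffload
}

func main() {
	ten, huge := uint64(10), uint64(99)
	fmt.Println(splitLayers(32, nil))   // 0 32  (fully offloaded by default)
	fmt.Println(splitLayers(32, &ten))  // 22 10
	fmt.Println(splitLayers(32, &huge)) // 0 32  (clamped to the block count)
}
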
{ kt, vt := GGMLTypeF16, GGMLTypeF16 - kvSize := contextSize + nKV := nContext if o.CacheKeyType != nil { kt = *o.CacheKeyType } @@ -57,7 +134,7 @@ func (gf *GGUFFile) Estimate(opts ...GGUFEstimateOption) (ge GGUFEstimate) { if a.Architecture == "mamba" { // See https://github.com/ggerganov/llama.cpp/blob/7672adeec7a79ea271058c63106c142ba84f951a/llama.cpp#L16122-L16129. kt, vt = GGMLTypeF32, GGMLTypeF32 - kvSize = uint64(ptr.Deref(o.ParallelSize, 1)) + nKV = nParallel } var ( @@ -69,54 +146,139 @@ func (gf *GGUFFile) Estimate(opts ...GGUFEstimateOption) (ge GGUFEstimate) { embedValGQA += uint64(a.SSMStateSize * a.SSMInnerSize) } - krs := kt.RowSizeOf([]uint64{embedKeyGQA * kvSize}) - vrs := vt.RowSizeOf([]uint64{embedValGQA * kvSize}) + krs := kt.RowSizeOf([]uint64{embedKeyGQA * nKV}) + vrs := vt.RowSizeOf([]uint64{embedValGQA * nKV}) - ge.KVCache.Key = GGUFBytesScalar(krs * a.BlockCount) - ge.KVCache.Value = GGUFBytesScalar(vrs * a.BlockCount) + ge.Load.KVCache.Key = GGUFBytesScalar(krs * nLoadLayers) + ge.Load.KVCache.Value = GGUFBytesScalar(vrs * nLoadLayers) + ge.Offload.KVCache.Key = GGUFBytesScalar(krs * nOffloadLayers) + ge.Offload.KVCache.Value = GGUFBytesScalar(vrs * nOffloadLayers) } - // Others. + // Tokens. + ge.Load.Tokens += GGUFBytesScalar(t.TokensSize) + ge.Load.Tokens += GGUFBytesScalar(t.TokensLength * (4 /* token type */ + 4 /* token score*/)) + if t.Model == "gpt2" { + ge.Load.Tokens += GGUFBytesScalar(t.MergesSize) + ge.Load.Tokens += GGUFBytesScalar(t.MergesLength * (48 /* key type */ + 56 /* value type */)) + } + + // Compute. { - // Overhead - ge.Others += GGUFBytesScalar(15 * 1024 * 1024) // NB(thxCode): Magic here. + // Bootstrap. + ge.Load.Compute.Others += GGUFBytesScalar(15 * 1024 * 1024) // GGML context, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L5015-L5036. ggmlCtx := 2 /* buffer count */ * GGMLTensorOverhead() * (uint64(len(gf.TensorInfos)) + 1 + a.BlockCount*3) - ge.Others += GGUFBytesScalar(ggmlCtx) + ge.Load.Compute.Others += GGUFBytesScalar(ggmlCtx) // Output buffer, // see https://github.com/ggerganov/llama.cpp/blob/7672adeec7a79ea271058c63106c142ba84f951a/llama.cpp#L11940-L12003. - outBuffer := 4 /* float32 size */ * (a.VocabularyLength + a.EmbeddingLength) * uint64(ptr.Deref(o.ParallelSize, 1)) - ge.Others += GGUFBytesScalar(outBuffer) - } + outBuffer := 4 /* float32 size */ * (a.VocabularyLength + a.EmbeddingLength) * nParallel + ge.Load.Compute.Others += GGUFBytesScalar(outBuffer) - // Computation graph. - { + // Graph overhead. graphOverhead := GGMLTensorOverhead()*GGMLComputationGraphNodesMaximum + GGMLComputationGraphOverhead(GGMLComputationGraphNodesMaximum, false) - ge.ComputationGraphOverhead += GGUFBytesScalar(graphOverhead) + ge.Load.Compute.Others += GGUFBytesScalar(graphOverhead) + } + // Computation graph. + { + // Tensor usage, + // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L16149. + // + // Firstly, get the usage of input tensors. 
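
Editor's aside (not part of the patch): the per-layer KV-cache split in plain byte arithmetic. This sketch assumes 2-byte f16 cache cells, whereas the library goes through GGMLType.RowSizeOf so quantized cache types stay exact.

package main

import "fmt"

// kvCacheBytes charges one key row (embedKeyGQA*nKV cells) and one value row
// (embedValGQA*nKV cells) per layer, splitting the total between host-resident
// and offloaded layers as the patch does.
func kvCacheBytes(embedKeyGQA, embedValGQA, nKV, nLoadLayers, nOffloadLayers uint64) (load, offload uint64) {
	const f16 = 2 // bytes per cell, assumed here for simplicity
	perLayer := f16*embedKeyGQA*nKV + f16*embedValGQA*nKV
	return perLayer * nLoadLayers, perLayer * nOffloadLayers
}

func main() {
	// Illustrative shape: 4096-wide KV projection, 512-token cache,
	// 22 layers kept on the host and 10 offloaded.
	load, offload := kvCacheBytes(4096, 4096, 512, 22, 10)
	fmt.Println(load, offload) // 184549376 83886080
}
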
var ( - nBatch = min(contextSize, uint64(ptr.Deref(o.BatchSize, 512))) - inpTokens = GGMLTypeI32.RowSizeOf([]uint64{nBatch}) // I32 [n_batch] inpEmbd = GGMLTypeF32.RowSizeOf([]uint64{a.EmbeddingLength, nBatch}) // F32 [n_embd, n_batch] - inpPos = GGMLTypeI32.RowSizeOf([]uint64{contextSize}) // I32 [n_tokens] - inpOutIds = GGMLTypeI32.RowSizeOf([]uint64{contextSize}) // I32 [n_output], - inpKQMask = GGMLTypeF32.RowSizeOf([]uint64{contextSize, nBatch}) // F32 [n_kv, n_batch] + inpPos = GGMLTypeI32.RowSizeOf([]uint64{nContext}) // I32 [n_tokens] + inpOutIds = GGMLTypeI32.RowSizeOf([]uint64{nContext}) // I32 [n_output], + inpKQMask = GGMLTypeF32.RowSizeOf([]uint64{nContext, nBatch}) // F32 [n_kv, n_batch] ) - ge.ComputationGraphOverhead += GGUFBytesScalar(inpTokens + inpEmbd + inpPos + inpKQMask + inpOutIds) + ge.Load.Compute.Graph += GGUFBytesScalar(inpTokens + inpEmbd + inpPos + inpKQMask + inpOutIds) + if nOffloadLayers > 0 { + ge.Offload.Compute.Graph += GGUFBytesScalar(inpEmbd + inpPos + inpKQMask + inpOutIds) + } + // Since the steps between transformer layers are serial, + // the allocated memory can be reused for the next layer. + // So, we only consider the usage of the largest layer, + // which is the last layer by default. + kvcInc := uint64(ge.Load.KVCache.Key + ge.Offload.KVCache.Key) + for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.attn_(norm|q)\.weight`)) { + rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nBatch}) + kvcInc += rs + if strings.HasSuffix(l.Name, ".attn_q.weight") { + kvcInc += rs // for RoPE + } + } + var ffnInc uint64 + for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.(attn_norm|ffn_norm|ffn_gate|ffn_up)\.weight`)) { + rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nBatch}) + ffnInc += rs + } + if nLoadLayers == a.BlockCount { + ge.Load.Compute.Graph += GGUFBytesScalar(max(kvcInc, ffnInc)) + } else { + ge.Offload.Compute.Graph += GGUFBytesScalar(max(kvcInc, ffnInc)) + if nLoadLayers > 0 { + ffnInc = 0 + for _, l := range tfLs[nLoadLayers-1].Search(regexp.MustCompile(`.*\.\d+\.ffn_(norm|gate|up)\.weight`)) { + rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nBatch}) + ffnInc += rs + } + ge.Load.Compute.Graph += GGUFBytesScalar(max(kvcInc, ffnInc)) + } + } } return ge } -func (e GGUFEstimate) Sum() GGUFBytesScalar { - return e.KVCache.Sum() + e.ComputationGraphOverhead + e.Others +type ( + GGUFEstimateSum struct { + // UMA is the usage of unified memory architecture. + UMA GGUFEstimateSumItem `json:"uma"` + // NonUMA is the usage of non-unified memory architecture. + NonUMA GGUFEstimateSumItem `json:"nonUMA"` + } + GGUFEstimateSumItem struct { + // RAM is the memory usage of the RAM. + RAM GGUFBytesScalar `json:"ram"` + // VRAM is the memory usage of the VRAM. 
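
Editor's aside (not part of the patch): the graph's input buffers expressed as plain element counts: token IDs, positions and output IDs are int32; the embedding and KQ-mask buffers are float32. The library computes the same quantities via GGMLType.RowSizeOf.

package main

import "fmt"

func inputTensorBytes(nEmbd, nContext, nBatch uint64) uint64 {
	const i32, f32 = 4, 4
	inpTokens := i32 * nBatch            // I32 [n_batch]
	inpEmbd := f32 * nEmbd * nBatch      // F32 [n_embd, n_batch]
	inpPos := i32 * nContext             // I32 [n_tokens]
	inpOutIds := i32 * nContext          // I32 [n_output]
	inpKQMask := f32 * nContext * nBatch // F32 [n_kv, n_batch]
	return inpTokens + inpEmbd + inpPos + inpOutIds + inpKQMask
}

func main() {
	fmt.Println(inputTensorBytes(4096, 4096, 512)) // 16812032 bytes (~16 MiB)
}
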
+ VRAM GGUFBytesScalar `json:"vram"` + } +) + +func (e GGUFEstimate) Sum(mmap bool) (gs GGUFEstimateSum) { + gs.UMA = GGUFEstimateSumItem{ + RAM: e.Load.KVCache.Sum() + e.Offload.KVCache.Sum() + e.Load.Tokens + e.Load.Compute.Others, + VRAM: e.Offload.Compute.Sum(), + } + if !mmap { + gs.UMA.RAM += e.Load.Weight.Sum() + gs.UMA.VRAM += e.Offload.Weight.Sum() + } + gs.NonUMA = GGUFEstimateSumItem{ + RAM: e.Load.KVCache.Sum() + e.Load.Tokens + e.Load.Compute.Sum(), + VRAM: e.Offload.KVCache.Sum() + e.Offload.Compute.Sum(), + } + if !mmap { + gs.NonUMA.RAM += e.Load.Weight.Sum() + gs.NonUMA.VRAM += e.Offload.Weight.Sum() + } + return gs +} + +func (w GGUFWeightUsage) Sum() GGUFBytesScalar { + return w.Compute + w.Input + w.Output } func (c GGUFKVCacheUsage) Sum() GGUFBytesScalar { return c.Key + c.Value } + +func (c GGUFComputeUsage) Sum() GGUFBytesScalar { + return c.Graph + c.Others +} diff --git a/file_estimate_option.go b/file_estimate_option.go index 80f2b10..5fb93ef 100644 --- a/file_estimate_option.go +++ b/file_estimate_option.go @@ -11,6 +11,7 @@ type ( BatchSize *int32 CacheKeyType *GGMLType CacheValueType *GGMLType + OffloadLayers *uint64 } GGUFEstimateOption func(*_GGUFEstimateOptions) ) @@ -72,3 +73,10 @@ func WithCacheValueType(t GGMLType) GGUFEstimateOption { } } } + +// WithOffloadLayers sets the number of layers to offload. +func WithOffloadLayers(layers uint64) GGUFEstimateOption { + return func(o *_GGUFEstimateOptions) { + o.OffloadLayers = &layers + } +} diff --git a/file_estimate_test.go b/file_estimate_test.go index 5f7380f..4d98e5e 100644 --- a/file_estimate_test.go +++ b/file_estimate_test.go @@ -93,3 +93,33 @@ func TestGGUFFile_Estimate_ContextSize(t *testing.T) { }) } } + +func TestGGUFFile_Estimate_OffloadLayers(t *testing.T) { + ctx := context.Background() + + f, err := ParseGGUFFileFromHuggingFace( + ctx, + "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", + "Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf", + SkipLargeMetadata()) + if err != nil { + t.Fatal(err) + return + } + + cases := []struct { + name string + opts []GGUFEstimateOption + }{ + {"offload 0 layer", []GGUFEstimateOption{WithContextSize(512), WithOffloadLayers(0)}}, + {"offload 1 layer", []GGUFEstimateOption{WithContextSize(512), WithOffloadLayers(1)}}, + {"offload 10 layers", []GGUFEstimateOption{WithContextSize(512), WithOffloadLayers(10)}}, + {"offload all layers", []GGUFEstimateOption{WithContextSize(512)}}, + {"offload 33 layers", []GGUFEstimateOption{WithContextSize(512), WithOffloadLayers(33)}}, // exceeds the number of layers + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Log("\n", spew.Sdump(f.Estimate(tc.opts...)), "\n") + }) + } +} diff --git a/file_tokenizer.go b/file_tokenizer.go index 2865a01..29ba9aa 100644 --- a/file_tokenizer.go +++ b/file_tokenizer.go @@ -2,10 +2,14 @@ package gguf_parser // GGUFTokenizerMetadata represents the tokenizer metadata of a GGUF file. type GGUFTokenizerMetadata struct { + /* Basic */ + // Model is the model of the tokenizer. Model string `json:"model"` // TokensLength is the size of tokens. - TokensLength uint64 `json:"tokenLength"` + TokensLength uint64 `json:"tokensLength"` + // MergesLength is the size of merges. + MergesLength uint64 `json:"mergesLength"` // AddedTokensLength is the size of added tokens after training. AddedTokensLength uint64 `json:"addedTokenLength"` // BOSTokenID is the ID of the beginning of sentence token. @@ -28,6 +32,13 @@ type GGUFTokenizerMetadata struct { // // Use -1 if the token is not found.
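
Editor's aside (not part of the patch): how a caller can compare the new mmap-aware sums. With mmap the weights are paged from the file and excluded from both totals; without it they are charged to RAM for the loaded layers and to VRAM for the offloaded ones. The repository and file names below simply reuse the ones from the test above.

package main

import (
	"context"
	"fmt"

	. "github.com/thxcode/gguf-parser-go"
)

func main() {
	f, err := ParseGGUFFileFromHuggingFace(context.Background(),
		"NousResearch/Hermes-2-Pro-Mistral-7B-GGUF",
		"Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf",
		SkipLargeMetadata())
	if err != nil {
		panic(err)
	}

	e := f.Estimate(WithContextSize(512), WithOffloadLayers(10))
	withMMap, noMMap := e.Sum(true), e.Sum(false)
	fmt.Printf("NonUMA with mmap:    RAM=%v VRAM=%v\n", withMMap.NonUMA.RAM, withMMap.NonUMA.VRAM)
	fmt.Printf("NonUMA without mmap: RAM=%v VRAM=%v\n", noMMap.NonUMA.RAM, noMMap.NonUMA.VRAM)
}
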
PaddingTokenID int64 `json:"paddingTokenID"` + + /* Appendix */ + + // TokensSize is the size of tokens in bytes. + TokensSize int64 `json:"tokensSize"` + // MergesSize is the size of merges in bytes. + MergesSize int64 `json:"mergesSize"` } // Tokenizer returns the tokenizer metadata of a GGUF file. @@ -35,6 +46,7 @@ func (gf *GGUFFile) Tokenizer() (gt GGUFTokenizerMetadata) { const ( modelKey = "tokenizer.ggml.model" tokensKey = "tokenizer.ggml.tokens" + mergesKey = "tokenizer.ggml.merges" addedTokensKey = "tokenizer.ggml.added_tokens" bosTokenIDKey = "tokenizer.ggml.bos_token_id" eosTokenIDKey = "tokenizer.ggml.eos_token_id" @@ -46,6 +58,7 @@ func (gf *GGUFFile) Tokenizer() (gt GGUFTokenizerMetadata) { m, _ := gf.Header.MetadataKV.Index([]string{ modelKey, tokensKey, + mergesKey, addedTokensKey, bosTokenIDKey, eosTokenIDKey, @@ -64,7 +77,14 @@ func (gf *GGUFFile) Tokenizer() (gt GGUFTokenizerMetadata) { if v, ok := m[modelKey]; ok { gt.Model = v.ValueString() } if v, ok := m[tokensKey]; ok { - gt.TokensLength = v.ValueArray().Len + arr := v.ValueArray() + gt.TokensLength = arr.Len + gt.TokensSize = arr.Size + } + if v, ok := m[mergesKey]; ok { + arr := v.ValueArray() + gt.MergesLength = arr.Len + gt.MergesSize = arr.Size + } if v, ok := m[addedTokensKey]; ok { gt.AddedTokensLength = v.ValueArray().Len
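
Editor's aside (not part of the patch): how the new TokensSize/MergesSize fields feed the "Tokens" term of the estimate: raw array bytes plus a fixed per-entry overhead (4 B type + 4 B score per token; 48 B key + 56 B value per merge for a gpt2-style BPE tokenizer). The constants are the ones used by the patch; tokenBytes itself is a hypothetical helper, not a library function.

package main

import (
	"fmt"

	. "github.com/thxcode/gguf-parser-go"
)

// tokenBytes estimates the host memory attributed to tokenizer data.
func tokenBytes(t GGUFTokenizerMetadata) uint64 {
	n := uint64(t.TokensSize) + t.TokensLength*(4+4)
	if t.Model == "gpt2" {
		n += uint64(t.MergesSize) + t.MergesLength*(48+56)
	}
	return n
}

func main() {
	// Made-up sizes, purely to exercise the arithmetic.
	t := GGUFTokenizerMetadata{Model: "gpt2", TokensLength: 32000, TokensSize: 380000, MergesLength: 60000, MergesSize: 500000}
	fmt.Println(tokenBytes(t)) // 7376000 bytes for this made-up tokenizer
}
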