diff --git a/cmd/gguf-parser/README.md b/cmd/gguf-parser/README.md
index e7d2af1..1a83e61 100644
--- a/cmd/gguf-parser/README.md
+++ b/cmd/gguf-parser/README.md
@@ -7,7 +7,7 @@ Review/Check/Estimate [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/
 ```shell
 $ gguf-parser --help
 Usage of gguf-parser ...:
-  -batch-size int
+  -ubatch-size int
         Specify the physical maximum batch size, which is used to estimate the usage, default is 512. (default 512)
   -ctx-size int
         Specify the size of prompt context, which is used to estimate the usage, default is equal to the model's maximum context size. (default -1)
diff --git a/cmd/gguf-parser/main.go b/cmd/gguf-parser/main.go
index a30cca3..829cbf4 100644
--- a/cmd/gguf-parser/main.go
+++ b/cmd/gguf-parser/main.go
@@ -32,10 +32,11 @@ func main() {
 		skipTLSVerify bool
 		// estimate options
 		ctxSize           = -1
-		batchSize         = 512
+		physicalBatchSize = 512
 		parallelSize      = 1
 		kvType            = "f16"
 		flashAttention    bool
+		platformFootprint = "150,250"
 		noMMap            bool
 		offloadLayers     = -1
 		offloadLayersStep uint64
@@ -45,6 +46,7 @@ func main() {
 		skipArchitecture bool
 		skipTokenizer    bool
 		skipEstimate     bool
+		inMib            bool
 		json             bool
 		jsonPretty       = true
 	)
@@ -61,15 +63,19 @@ func main() {
 		"/resolve/main/Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf. "+
 		"Note that gguf-parser does not need to download the entire GGUF file.")
 	fs.StringVar(&repo, "repo", repo, "Repository of HuggingFace which the GGUF file store, e.g. "+
-		"NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF, works with --file.")
+		"NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF, works with --file. [Deprecated, use --hf-repo instead]") // Deprecated.
 	fs.StringVar(&file, "file", file, "Model file below the --repo, e.g. "+
+		"Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf. [Deprecated, use --hf-file instead]") // Deprecated.
+	fs.StringVar(&repo, "hf-repo", repo, "Repository of HuggingFace which the GGUF file store, e.g. "+
+		"NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF, works with --hf-file.")
+	fs.StringVar(&file, "hf-file", file, "Model file below the --hf-repo, e.g. "+
 		"Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf.")
 	fs.BoolVar(&debug, "debug", debug, "Enable debugging, verbosity.")
 	fs.BoolVar(&skipTLSVerify, "skip-tls-verify", skipTLSVerify, "Skip TLS verification, works with --url.")
 	fs.IntVar(&ctxSize, "ctx-size", ctxSize, "Specify the size of prompt context, "+
 		"which is used to estimate the usage, "+
 		"default is equal to the model's maximum context size.")
-	fs.IntVar(&batchSize, "batch-size", batchSize, "Specify the physical maximum batch size, "+
+	fs.IntVar(&physicalBatchSize, "ubatch-size", physicalBatchSize, "Specify the physical maximum batch size, "+
 		"which is used to estimate the usage, "+
 		"default is 512.")
 	fs.IntVar(&parallelSize, "parallel-size", parallelSize, "Specify the number of parallel sequences to decode, "+
@@ -82,20 +88,32 @@ func main() {
 	fs.BoolVar(&flashAttention, "flash-attention", flashAttention, "Specify enabling Flash Attention, "+
 		"which is used to estimate the usage. "+
 		"Flash Attention can reduce the usage of RAM/VRAM.")
+	fs.StringVar(&platformFootprint, "platform-footprint", platformFootprint, "Specify the platform footprint(RAM,VRAM) in MiB, "+
+		"which is used to estimate the NonUMA usage, "+
+		"default is 150,250. "+
+		"Different platform always gets different RAM and VRAM footprints, "+
+		"for example, within CUDA, `cudaMemGetInfo` would occupy some RAM and VRAM, "+
+		"see https://stackoverflow.com/questions/64854862/free-memory-occupied-by-cudamemgetinfo.")
 	fs.BoolVar(&noMMap, "no-mmap", noMMap, "Specify disabling Memory-Mapped using, "+
 		"which is used to estimate the usage. "+
 		"Memory-Mapped can avoid loading the entire model weights into RAM.")
 	fs.IntVar(&offloadLayers, "offload-layers", offloadLayers, "Specify how many layers to offload, "+
+		"which is used to estimate the usage, "+
+		"default is full offloaded. [Deprecated, use --gpu-layers instead]") // Deprecated.
+	fs.IntVar(&offloadLayers, "gpu-layers", offloadLayers, "Specify how many layers to offload, "+
 		"which is used to estimate the usage, "+
 		"default is full offloaded.")
 	fs.Uint64Var(&offloadLayersStep, "offload-layers-step", offloadLayersStep, "Specify the step of layers to offload, "+
-		"works with --offload-layers.")
+		"works with --offload-layers. [Deprecated, use --gpu-layers-step instead]") // Deprecated.
+	fs.Uint64Var(&offloadLayersStep, "gpu-layers-step", offloadLayersStep, "Specify the step of layers to offload, "+
+		"works with --gpu-layers.")
 	fs.BoolVar(&version, "version", version, "Show gguf-parser version.")
 	fs.BoolVar(&skipModel, "skip-model", skipModel, "Skip to display model metadata.")
 	fs.BoolVar(&skipArchitecture, "skip-architecture", skipArchitecture, "Skip to display architecture metadata.")
 	fs.BoolVar(&skipTokenizer, "skip-tokenizer", skipTokenizer, "Skip to display tokenizer metadata")
 	fs.BoolVar(&skipEstimate, "skip-estimate", skipEstimate, "Skip to estimate.")
-	fs.BoolVar(&json, "json", json, "Output as JSON,")
+	fs.BoolVar(&inMib, "in-mib", inMib, "Display the estimated result in table with MiB.")
+	fs.BoolVar(&json, "json", json, "Output as JSON.")
 	fs.BoolVar(&jsonPretty, "json-pretty", jsonPretty, "Output as pretty JSON.")
 	if err := fs.Parse(os.Args[1:]); err != nil {
 		fmt.Println(err.Error())
@@ -127,8 +145,8 @@ func main() {
 	if ctxSize > 0 {
 		eopts = append(eopts, WithContextSize(int32(ctxSize)))
 	}
-	if batchSize > 0 {
-		eopts = append(eopts, WithBatchSize(int32(batchSize)))
+	if physicalBatchSize > 0 {
+		eopts = append(eopts, WithPhysicalBatchSize(int32(physicalBatchSize)))
 	}
 	if parallelSize > 0 {
 		eopts = append(eopts, WithParallelSize(int32(parallelSize)))
@@ -208,6 +226,23 @@ func main() {
 	}
 
 	// Output
+	var (
+		mmap                      = !noMMap
+		platformRAM, platformVRAM uint64
+	)
+	{
+		if platformFootprint != "" {
+			parts := strings.Split(platformFootprint, ",")
+			if len(parts) == 2 {
+				if v, err := strconv.ParseUint(parts[0], 10, 64); err == nil {
+					platformRAM = v * 1024 * 1024
+				}
+				if v, err := strconv.ParseUint(parts[1], 10, 64); err == nil {
+					platformVRAM = v * 1024 * 1024
+				}
+			}
+		}
+	}
 
 	if json {
 		o := map[string]any{}
@@ -221,7 +256,7 @@ func main() {
 			o["tokenizer"] = t
 		}
 		if !skipEstimate {
-			es := e.Summarize(!noMMap)
+			es := e.Summarize(mmap, platformRAM, platformVRAM)
 			switch {
 			case offloadLayersStep > e.OffloadLayers:
 				offloadLayersStep = e.OffloadLayers
@@ -241,7 +276,7 @@ func main() {
 						defer wg.Done()
 						eopts := eopts[:len(eopts):len(eopts)]
 						eopts = append(eopts, WithOffloadLayers(uint64(i)*offloadLayersStep))
-						ess[i] = gf.EstimateLLaMACppUsage(eopts...).SummarizeMemory(!noMMap)
+						ess[i] = gf.EstimateLLaMACppUsage(eopts...).SummarizeMemory(mmap, platformRAM, platformVRAM)
 					}(i)
 				}
 				wg.Wait()
@@ -263,6 +298,8 @@ func main() {
 		return
 	}
 
+	InMiBytes = inMib
+
 	if !skipModel {
 		tprint(
 			"MODEL",
@@ -313,7 +350,7 @@ func main() {
 	}
 
 	if !skipEstimate {
-		es := e.Summarize(!noMMap)
+		es := e.Summarize(mmap, platformRAM, platformVRAM)
 		switch {
 		case offloadLayersStep > e.OffloadLayers:
 			offloadLayersStep = e.OffloadLayers
@@ -333,7 +370,7 @@ func main() {
 					defer wg.Done()
 					eopts := eopts[:len(eopts):len(eopts)]
 					eopts = append(eopts, WithOffloadLayers(uint64(i)*offloadLayersStep))
-					ess[i] = gf.EstimateLLaMACppUsage(eopts...).SummarizeMemory(!noMMap)
+					ess[i] = gf.EstimateLLaMACppUsage(eopts...).SummarizeMemory(mmap, platformRAM, platformVRAM)
 				}(i)
 			}
 			wg.Wait()
diff --git a/file.go b/file.go
index 569f8e4..7980b5e 100644
--- a/file.go
+++ b/file.go
@@ -514,27 +514,34 @@ const (
 	_PiBytes
 )
 
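+var InMiBytes bool // InMiBytes, when true, makes GGUFBytesScalar.String format non-zero values in MiB instead of picking the largest fitting unit.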
+
 func (s GGUFBytesScalar) String() string {
 	if s == 0 {
 		return "0 B"
 	}
 	b, u := float64(1), "B"
-	switch {
-	case s >= _PiBytes:
-		b = _PiBytes
-		u = "PiB"
-	case s >= _TiBytes:
-		b = _TiBytes
-		u = "TiB"
-	case s >= _GiBytes:
-		b = _GiBytes
-		u = "GiB"
-	case s >= _MiBytes:
+	if InMiBytes {
 		b = _MiBytes
 		u = "MiB"
-	case s >= _KiBytes:
-		b = _KiBytes
-		u = "KiB"
+	} else {
+		switch {
+		case s >= _PiBytes:
+			b = _PiBytes
+			u = "PiB"
+		case s >= _TiBytes:
+			b = _TiBytes
+			u = "TiB"
+		case s >= _GiBytes:
+			b = _GiBytes
+			u = "GiB"
+		case s >= _MiBytes:
+			b = _MiBytes
+			u = "MiB"
+		case s >= _KiBytes:
+			b = _KiBytes
+			u = "KiB"
+		}
 	}
 	f := strconv.FormatFloat(float64(s)/b, 'f', 2, 64)
 	return strings.TrimSuffix(f, ".00") + " " + u
diff --git a/file_architecture.go b/file_architecture.go
index c242972..513787a 100644
--- a/file_architecture.go
+++ b/file_architecture.go
@@ -83,6 +83,8 @@ type GGUFArchitectureMetadata struct {
 
 	/* Appendix */
 
+	// EmbeddingGroup is the number of attention heads per key-value head, i.e. AttentionHeadCount / AttentionHeadCountKV (0 when AttentionHeadCountKV is 0).
+	EmbeddingGroup uint64 `json:"embeddingGroup,omitempty"`
 	// EmbeddingKeyGQA is the number of key GQA in the embedding layer.
 	EmbeddingKeyGQA uint64 `json:"embeddingKeyGQA,omitempty"`
 	// EmbeddingValueGQA is the number of value GQA in the embedding layer.
@@ -274,6 +276,9 @@ func (gf *GGUFFile) Architecture() (ga GGUFArchitectureMetadata) {
 	}
 
 	{
+		if ga.AttentionHeadCountKV > 0 {
+			ga.EmbeddingGroup = ga.AttentionHeadCount / ga.AttentionHeadCountKV
+		}
 		if ga.AttentionHeadCount > 0 {
 			ga.EmbeddingKeyGQA = uint64(ga.AttentionKeyLength) * ga.AttentionHeadCountKV
 			ga.EmbeddingValueGQA = uint64(ga.AttentionValueLength) * ga.AttentionHeadCountKV
diff --git a/file_estimate.go b/file_estimate.go
index 339bae5..0177e91 100644
--- a/file_estimate.go
+++ b/file_estimate.go
@@ -87,6 +87,9 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) (
 	if o.CacheValueType == nil {
 		o.CacheValueType = ptr.To(GGMLTypeF16)
 	}
+	if o.PhysicalBatchSize == nil {
+		o.PhysicalBatchSize = ptr.To(int32(512))
+	}
 
 	// Architecture and tokenizer metadata.
 	var (
@@ -138,7 +141,7 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) (
 		}
 		// Correct token size,
 		// see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L12221-L12224.
-		nTokens = min(nContext, uint64(ptr.Deref(o.BatchSize, 512)))
+		nTokens = min(nContext, uint64(*o.PhysicalBatchSize))
 		nBatch = nTokens
 		nOutputs = nTokens
 		nParallel = uint64(ptr.Deref(o.ParallelSize, 1))
@@ -230,12 +233,7 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) (
 		if _, ok := opLs.Get("output.weight"); ok {
 			e.Load.Weight.Output = GGUFBytesScalar(opLs.Bytes())
 		} else {
-			e.Load.Weight.Output = GGUFBytesScalar(opLs.Bytes() + ioLs.Bytes() /* duplicate the input layer */)
-		}
-		if isOffloadOutputLayer && nLoadLayers == 0 { // Full offloaded.
-			// Transfer the output weight to VRAM when all layers are offloaded.
-			e.Offload.Weight.Output = e.Load.Weight.Output
-			e.Load.Weight.Output = 0
+			e.Load.Weight.Output = GGUFBytesScalar(opLs.Bytes()) + e.Load.Weight.Input /* duplicate the input layer */
 		}
 	}
 
@@ -318,38 +316,54 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) (
 			}
 			e.Offload.Computation.Compute = GGUFBytesScalar(convInc + ssmInc)
 		} else {
-			attnInc := uint64(0)
+			loadAttnInc, offloadAttnInc := uint64(0), uint64(0)
 			if o.FlashAttention {
 				// https://github.com/ggerganov/llama.cpp/blob/172c8256840ffd882ab9992ecedbb587d9b21f15/llama.cpp#L7387.
-				attnInc = GGMLTypeF16.RowSizeOf([]uint64{nKV, nTokens})
+				offloadAttnInc = GGMLTypeF16.RowSizeOf([]uint64{nKV, nTokens})
 				for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.attn_(norm|q|qkv)\.weight`)) {
 					if strings.HasSuffix(l.Name, ".attn_norm.weight") {
 						rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens})
-						attnInc += rs
+						offloadAttnInc += rs
 						continue
 					}
 					rs := l.Bytes()
-					attnInc += rs
+					offloadAttnInc += rs
 				}
 				// https://github.com/ggerganov/llama.cpp/blob/172c8256840ffd882ab9992ecedbb587d9b21f15/llama.cpp#L6986-L6992.
 				rs := o.CacheKeyType.RowSizeOf([]uint64{uint64(a.AttentionKeyLength), nKV, a.AttentionHeadCountKV})
-				attnInc += rs
+				offloadAttnInc += rs
 				// https://github.com/ggerganov/llama.cpp/blob/172c8256840ffd882ab9992ecedbb587d9b21f15/llama.cpp#L7000-L7007.
 				rs = o.CacheValueType.RowSizeOf([]uint64{uint64(a.AttentionValueLength), nKV, a.AttentionHeadCountKV})
-				attnInc += rs
+				offloadAttnInc += rs
 			} else {
-				attnInc = uint64(e.Load.KVCache.Key + e.Offload.KVCache.Key)
+				offloadAttnInc = uint64(0)
 				for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.attn_(norm|q|qkv)\.weight`)) {
-					rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens})
-					attnInc += rs
+					var rs uint64
 					switch {
-					default:
-						continue
+					default: // norm.
+						rs = GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens})
+						offloadAttnInc += rs
 					case strings.HasSuffix(l.Name, ".attn_q.weight"):
+						rs = GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[0], nTokens})
+						offloadAttnInc += rs * 2 // Qcur, Qcur + RoPE.
+						if !isOffloadOutputLayer {
+							loadAttnInc = rs // Vcur.
+						}
+						rs = GGMLTypeF32.RowSizeOf([]uint64{nKV, nTokens, a.AttentionHeadCount})
+						offloadAttnInc += rs // kq.
+						rs = o.CacheKeyType.RowSizeOf([]uint64{uint64(a.AttentionKeyLength), nKV, a.AttentionHeadCountKV})
+						offloadAttnInc += rs * 2 // k-?, v-?.
 					case strings.HasSuffix(l.Name, ".attn_qkv.weight"):
 						rs = GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[0], nTokens})
+						offloadAttnInc += rs * 2 // Qcur, Qcur + RoPE.
+						if !isOffloadOutputLayer {
+							loadAttnInc = rs // Vcur.
+						}
+						rs = GGMLTypeF32.RowSizeOf([]uint64{nKV, nTokens, a.AttentionHeadCount})
+						offloadAttnInc += rs // kq.
+						rs = o.CacheKeyType.RowSizeOf([]uint64{uint64(a.AttentionKeyLength), nKV, a.AttentionHeadCountKV})
+						offloadAttnInc += rs * 2 // k-?, v-?.
 					}
-					attnInc += rs * 2 // for RoPE
 				}
 			}
 			ffnInc := uint64(0)
@@ -357,7 +371,8 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) (
 				rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens})
 				ffnInc += rs
 			}
-			e.Offload.Computation.Compute = GGUFBytesScalar(max(attnInc, ffnInc))
+			e.Load.Computation.Compute = GGUFBytesScalar(loadAttnInc)
+			e.Offload.Computation.Compute = GGUFBytesScalar(max(offloadAttnInc, ffnInc))
 			// Special case: we cannot use mmap for splitting expert weights in MoE.
 			if a.ExpertCount > 0 {
 				e.NoMMap = len(tfLs[0].Search(regexp.MustCompile(`.*\.\d+\.ffn_gate_exps\.weight`))) == 0
@@ -425,7 +440,9 @@ type (
 	}
 )
 
-func (e LLaMACppUsageEstimate) SummarizeMemory(mmap bool) (ems LLaMACppUsageEstimateMemorySummary) {
+// SummarizeMemory returns the summary of the estimated memory usage of loading the GGUF file in llama.cpp.
+// mmap tells whether memory-mapping is enabled; platformRAM and platformVRAM are the platform footprints, in bytes, added to the NonUMA RAM and VRAM.
+func (e LLaMACppUsageEstimate) SummarizeMemory(mmap bool, platformRAM, platformVRAM uint64) (ems LLaMACppUsageEstimateMemorySummary) {
 	ems.OffloadLayers, ems.FullOffloaded = e.OffloadLayers, e.FullOffloaded
 	if ems.FullOffloaded {
 		ems.OffloadLayers++ // The output layer is offloaded.
@@ -443,31 +460,22 @@ func (e LLaMACppUsageEstimate) SummarizeMemory(mmap bool) (ems LLaMACppUsageEsti
 		}
 	}
 
-	// TODO(thxCode): complete more cases,
-	//  and support optional parameters for the following constants.
-	//
-	// Footprint,
-	// see https://github.com/ggerganov/llama.cpp/blob/f578b86b2123d0f92afbaa98a031df4d4464e582/llama.cpp#L2454-L2486.
-	const (
-		// The function `cudaMemGetInfo` occupies some memory,
-		// see https://github.com/ggerganov/llama.cpp/blob/f578b86b2123d0f92afbaa98a031df4d4464e582/ggml-cuda.cu#L3009-L3013,
-		// and https://stackoverflow.com/questions/64854862/free-memory-occupied-by-cudamemgetinfo.
-		cudaFootprint = GGUFBytesScalar(150 * 1024 * 1024)
-	)
-
 	// NonUMA.
 	{
 		// RAM.
-		fp := cudaFootprint + e.Load.Footprint
+		fp := GGUFBytesScalar(platformRAM) + e.Load.Footprint
 		wg := e.Load.Weight.Sum()
 		kv := e.Load.KVCache.Sum()
 		cp := e.Load.Computation.Sum()
 		ems.NonUMA.RAM = fp + wg + kv + cp
 		if !e.NoMMap && (mmap || e.FullOffloaded) {
 			ems.NonUMA.RAM -= wg
+			if !mmap {
+				ems.NonUMA.RAM += e.Load.Weight.Output
+			}
 		}
 		// VRAM.
-		fp = e.Offload.Footprint
+		fp = GGUFBytesScalar(platformVRAM) + e.Offload.Footprint
 		wg = e.Offload.Weight.Sum()
 		kv = e.Offload.KVCache.Sum()
 		cp = e.Offload.Computation.Sum()
@@ -477,10 +485,12 @@ func (e LLaMACppUsageEstimate) SummarizeMemory(mmap bool) (ems LLaMACppUsageEsti
 	return ems
 }
 
-func (e LLaMACppUsageEstimate) Summarize(mmap bool) (es LLaMACppUsageEstimateSummary) {
+// Summarize returns the summary of the estimated result of loading the GGUF file in llama.cpp.
+// mmap, platformRAM and platformVRAM are passed to SummarizeMemory to adjust the memory summary.
+func (e LLaMACppUsageEstimate) Summarize(mmap bool, platformRAM, platformVRAM uint64) (es LLaMACppUsageEstimateSummary) {
 	// Summarize memory.
 	es.Memory = []LLaMACppUsageEstimateMemorySummary{
-		e.SummarizeMemory(mmap),
+		e.SummarizeMemory(mmap, platformRAM, platformVRAM),
 	}
 
 	// Just copy from the original estimate.
diff --git a/file_estimate_option.go b/file_estimate_option.go
index 813b54b..a0cf10f 100644
--- a/file_estimate_option.go
+++ b/file_estimate_option.go
@@ -6,15 +6,15 @@ import (
 
 type (
 	_LLaMACppUsageEstimateOptions struct {
-		Architecture   *GGUFArchitectureMetadata
-		Tokenizer      *GGUFTokenizerMetadata
-		ContextSize    *int32
-		BatchSize      *int32
-		ParallelSize   *int32
-		CacheKeyType   *GGMLType
-		CacheValueType *GGMLType
-		OffloadLayers  *uint64
-		FlashAttention bool
+		Architecture      *GGUFArchitectureMetadata
+		Tokenizer         *GGUFTokenizerMetadata
+		ContextSize       *int32
+		PhysicalBatchSize *int32
+		ParallelSize      *int32
+		CacheKeyType      *GGMLType
+		CacheValueType    *GGMLType
+		OffloadLayers     *uint64
+		FlashAttention    bool
 	}
 	LLaMACppUsageEstimateOption func(*_LLaMACppUsageEstimateOptions)
 )
@@ -47,13 +47,13 @@ func WithContextSize(size int32) LLaMACppUsageEstimateOption {
 	}
 }
 
-// WithBatchSize sets the physical batch size for the estimate.
-func WithBatchSize(size int32) LLaMACppUsageEstimateOption {
+// WithPhysicalBatchSize sets the physical batch size for the estimate.
+func WithPhysicalBatchSize(size int32) LLaMACppUsageEstimateOption {
 	return func(o *_LLaMACppUsageEstimateOptions) {
 		if size <= 0 {
 			return
 		}
-		o.BatchSize = &size
+		o.PhysicalBatchSize = &size
 	}
 }