diff --git a/cmd/gguf-parser/main.go b/cmd/gguf-parser/main.go
index 598452f..93f4b67 100644
--- a/cmd/gguf-parser/main.go
+++ b/cmd/gguf-parser/main.go
@@ -1315,6 +1315,7 @@ func mainAction(c *cli.Context) error {
 			"Flash Attention",
 			"MMap Load",
 			"Embedding Only",
+			"Reranking",
 			"Distributable",
 			"Offload Layers",
 			"Full Offloaded",
@@ -1326,6 +1327,7 @@ func mainAction(c *cli.Context) error {
 			"Flash Attention",
 			"MMap Load",
 			"Embedding Only",
+			"Reranking",
 			"Distributable",
 			"Offload Layers",
 			"Full Offloaded",
@@ -1385,6 +1387,7 @@ func mainAction(c *cli.Context) error {
 			sprintf(tenary(flashAttention, tenary(es.FlashAttention, "Enabled", "Unsupported"), "Disabled")),
 			sprintf(tenary(mmap, tenary(!es.NoMMap, "Enabled", "Unsupported"), "Disabled")),
 			sprintf(tenary(es.EmbeddingOnly, "Yes", "No")),
+			sprintf(tenary(es.Reranking, "Supported", "Unsupported")),
 			sprintf(tenary(es.Distributable, "Supported", "Unsupported")),
 			sprintf(tenary(es.Items[i].FullOffloaded, sprintf("%d (%d + 1)", es.Items[i].OffloadLayers, es.Items[i].OffloadLayers-1), es.Items[i].OffloadLayers)),
diff --git a/file_estimate.go b/file_estimate.go
index 93b25fa..65fef70 100644
--- a/file_estimate.go
+++ b/file_estimate.go
@@ -34,6 +34,9 @@ type (
 		// EmbeddingOnly is the flag to indicate whether the model is used for embedding only,
 		// true for embedding only.
 		EmbeddingOnly bool `json:"embeddingOnly"`
+		// Reranking is the flag to indicate whether the model is used for reranking,
+		// true for reranking.
+		Reranking bool `json:"reranking"`
 		// Distributable is the flag to indicate whether the model is distributable,
 		// true for distributable.
 		Distributable bool `json:"distributable"`
@@ -215,6 +218,10 @@ func (gf *GGUFFile) EstimateLLaMACppRun(opts ...LLaMACppRunEstimateOption) (e LL
 	if a.Type == "model" && !a.AttentionCausal {
 		e.EmbeddingOnly = true
 		o.PhysicalBatchSize = o.LogicalBatchSize
+		// Reranking.
+		if _, found := gf.TensorInfos.Index([]string{"cls.bias", "cls.weight"}); found > 0 {
+			e.Reranking = true
+		}
 	}
 
 	// Distributable,
@@ -357,6 +364,9 @@ func (gf *GGUFFile) EstimateLLaMACppRun(opts ...LLaMACppRunEstimateOption) (e LL
 	ls := gf.Layers()
 	ioLs, tfLs, _ := ls.Cut([]string{
 		"token_embd.weight",
+		"token_embd_norm.weight",
+		"token_embd_norm.bias",
+		"token_types.weight",
 		"output.weight",
 		"output.bias",
 		"output_norm.weight",
@@ -364,6 +374,9 @@ func (gf *GGUFFile) EstimateLLaMACppRun(opts ...LLaMACppRunEstimateOption) (e LL
 	})
 	ipLs, opLs, _ := ioLs.Cut([]string{
 		"token_embd.weight",
+		"token_embd_norm.weight",
+		"token_embd_norm.bias",
+		"token_types.weight",
 	})
 
 	// Weight.
@@ -685,6 +698,9 @@ type (
 		// EmbeddingOnly is the flag to indicate whether the model is used for embedding only,
 		// true for embedding only.
 		EmbeddingOnly bool `json:"embeddingOnly"`
+		// Reranking is the flag to indicate whether the model is used for reranking,
+		// true for reranking.
+		Reranking bool `json:"reranking"`
 		// Distributable is the flag to indicate whether the model is distributable,
 		// true for distributable.
 		Distributable bool `json:"distributable"`
@@ -848,6 +864,7 @@ func (e LLaMACppRunEstimate) Summarize(mmap bool, nonUMARamFootprint, nonUMAVram
 	es.FlashAttention = e.FlashAttention
 	es.NoMMap = e.NoMMap
 	es.EmbeddingOnly = e.EmbeddingOnly
+	es.Reranking = e.Reranking
 	es.LogicalBatchSize = e.LogicalBatchSize
 	es.PhysicalBatchSize = e.PhysicalBatchSize
 	es.Distributable = e.Distributable