diff --git a/cmd/gguf-parser/README.md b/cmd/gguf-parser/README.md
new file mode 100644
index 0000000..a1f38f3
--- /dev/null
+++ b/cmd/gguf-parser/README.md
@@ -0,0 +1,234 @@
+# GGUF Parser
+
+Review/Check/Estimate [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) files.
+
+## Usage
+
+```shell
+$ gguf-parser --help
+Usage of gguf-parser ...:
+  -batch-size int
+    	Specify the physical maximum batch size, which is used to estimate the usage, default is 512. (default 512)
+  -ctx-size int
+    	Specify the size of the prompt context, which is used to estimate the usage, default is equal to the model's maximum context size. (default -1)
+  -debug
+    	Enable debugging with verbose output.
+  -file string
+    	Model file below the --repo, e.g. Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf.
+  -flash-attention
+    	Specify enabling Flash Attention, which is used to estimate the usage. Flash Attention can reduce the usage of RAM/VRAM.
+  -json
+    	Output as JSON.
+  -json-pretty
+    	Output as pretty JSON. (default true)
+  -kv-type string
+    	Specify the type of Key-Value cache, which is used to estimate the usage, select from [f32, f16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1], default is f16. Using a quantization type means enabling --flash-attention as well. (default "f16")
+  -no-mmap
+    	Specify disabling memory-mapped loading, which is used to estimate the usage. Memory-mapped loading can avoid loading all of the model weights into RAM.
+  -offload-layers int
+    	Specify how many layers to offload, which is used to estimate the usage, default is fully offloaded. (default -1)
+  -offload-layers-step uint
+    	Specify the step of layers to offload, works with --offload-layers.
+  -parallel-size int
+    	Specify the number of parallel sequences to decode, which is used to estimate the usage, default is 1. (default 1)
+  -path string
+    	Path of the GGUF file to load, e.g. ~/.cache/lm-studio/models/NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf.
+  -repo string
+    	HuggingFace repository where the GGUF file is stored, e.g. NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF, works with --file.
+  -skip-architecture
+    	Skip displaying architecture metadata.
+  -skip-estimate
+    	Skip estimating the usage.
+  -skip-model
+    	Skip displaying model metadata.
+  -skip-tls-verify
+    	Skip TLS verification, works with --url.
+  -skip-tokenizer
+    	Skip displaying tokenizer metadata.
+  -url string
+    	URL of the GGUF file to load, e.g. https://huggingface.co/NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF/resolve/main/Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf. Note that gguf-parser does not need to download the entire GGUF file.
+  -version
+    	Show gguf-parser version.
+```
+
+### Parse
+
+#### Parse local GGUF file
+
+```shell
+$ gguf-parser --path="~/.cache/lm-studio/models/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF/Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf"
++-------+-------+-------+----------------------+----------------+---------------+----------+------------+----------+
+| MODEL | NAME | ARCH | QUANTIZATION VERSION | FILE TYPE | LITTLE ENDIAN | SIZE | PARAMETERS | BPW |
++ +-------+-------+----------------------+----------------+---------------+----------+------------+----------+
+| | jeffq | llama | 2 | IQ3_XXS/Q5_K_M | true | 4.78 GiB | 7.24 B | 5.67 bpw |
++-------+-------+-------+----------------------+----------------+---------------+----------+------------+----------+
+
++--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
+| ARCHITECTURE | MAX CONTEXT LEN | EMBEDDING LEN | EMBEDDING GQA | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN |
++ +-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
+| | 32768 | 4096 | 1024 | 32 | 32 | 14336 | 0 | 32032 |
++--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
+
++-----------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
+| TOKENIZER | MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN |
++ +-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
+| | llama | 450.50 KiB | 32032 | 0 | 1 | 32000 | N/A | N/A | N/A |
++-----------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
+
++----------+-------+--------------+-----------------+--------------+----------------+----------+------------+-------------+
+| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | UMA RAM | NONUMA RAM | NONUMA VRAM |
++ +-------+--------------+-----------------+--------------+----------------+----------+------------+-------------+
+| | llama | 32768 | false | true | 32 | 4.09 GiB | 238.39 MiB | 10.70 GiB |
++----------+-------+--------------+-----------------+--------------+----------------+----------+------------+-------------+
+
+```
+
+#### Parse remote GGUF file
+
+```shell
+$ gguf-parser --url="https://huggingface.co/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF/resolve/main/Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf"
++-------+----------+-------+----------------------+-------------+---------------+--------+------------+----------+
+| MODEL | NAME | ARCH | QUANTIZATION VERSION | FILE TYPE | LITTLE ENDIAN | SIZE | PARAMETERS | BPW |
++ +----------+-------+----------------------+-------------+---------------+--------+------------+----------+
+| | emozilla | llama | 2 | Q4_K/Q3_K_M | true | 21 GiB | 46.70 B | 3.86 bpw |
++-------+----------+-------+----------------------+-------------+---------------+--------+------------+----------+
+
++--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
+| ARCHITECTURE | MAX CONTEXT LEN | EMBEDDING LEN | EMBEDDING GQA | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | 
VOCABULARY LEN | ++ +-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+ +| | 32768 | 4096 | 1024 | 32 | 32 | 14336 | 8 | 32002 | ++--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+ + ++-----------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ +| TOKENIZER | MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN | ++ +-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ +| | llama | 449.91 KiB | 32002 | 0 | 1 | 32000 | 0 | N/A | 2 | ++-----------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ + ++----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ +| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | UMA RAM | NONUMA RAM | NONUMA VRAM | ++ +-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ +| | llama | 32768 | false | false | 32 | 25.08 GiB | 395.24 MiB | 26.94 GiB | ++----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ + +``` + +#### Parse HuggingFace GGUF file + +```shell +$ gguf-parser --repo="openbmb/MiniCPM-Llama3-V-2_5-gguf" --file="ggml-model-Q5_K_M.gguf" ++-------+-------+-------+----------------------+----------------+---------------+----------+------------+----------+ +| MODEL | NAME | ARCH | QUANTIZATION VERSION | FILE TYPE | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | ++ +-------+-------+----------------------+----------------+---------------+----------+------------+----------+ +| | model | llama | 2 | IQ3_XXS/Q5_K_M | true | 5.33 GiB | 8.03 B | 5.70 bpw | ++-------+-------+-------+----------------------+----------------+---------------+----------+------------+----------+ + ++--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+ +| ARCHITECTURE | MAX CONTEXT LEN | EMBEDDING LEN | EMBEDDING GQA | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | ++ +-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+ +| | 8192 | 4096 | 1024 | 32 | 32 | 14336 | 0 | 128256 | ++--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+ + ++-----------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ +| TOKENIZER | MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN | ++ +-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ +| | gpt2 | 2 MiB | 128256 | 0 | 128000 | 128001 | 128002 | N/A | 0 | ++-----------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ + 
++----------+-------+--------------+-----------------+--------------+----------------+----------+------------+-------------+ +| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | UMA RAM | NONUMA RAM | NONUMA VRAM | ++ +-------+--------------+-----------------+--------------+----------------+----------+------------+-------------+ +| | llama | 8192 | false | true | 32 | 1.08 GiB | 234.61 MiB | 6.25 GiB | ++----------+-------+--------------+-----------------+--------------+----------------+----------+------------+-------------+ + +``` + +### Estimate + +#### Estimate with zero layers offload + +```shell +$ gguf-parser --repo="mradermacher/Falcon2-8B-Dutch-GGUF" --file="Falcon2-8B-Dutch.Q5_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer --offload-layers=0 ++----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ +| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | UMA RAM | NONUMA RAM | NONUMA VRAM | ++ +-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ +| | llama | 32768 | false | false | 0 | 25.08 GiB | 25.23 GiB | 2.10 GiB | ++----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ + +``` + +#### Estimate with specific layers offload + +```shell +$ gguf-parser --repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --file="Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer --offload-layers=10 ++----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ +| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | UMA RAM | NONUMA RAM | NONUMA VRAM | ++ +-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ +| | llama | 32768 | false | false | 10 | 25.08 GiB | 17.50 GiB | 9.83 GiB | ++----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ + +``` + +#### Estimate with specific context size + +```shell +$ gguf-parser --repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --file="Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer --ctx-size=4096 ++----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ +| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | UMA RAM | NONUMA RAM | NONUMA VRAM | ++ +-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ +| | llama | 4096 | false | false | 32 | 21.53 GiB | 339.24 MiB | 21.64 GiB | ++----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ + +``` + +#### Estimate with Flash Attention + +```shell +$ gguf-parser --repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --file="Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer --flash-attention ++----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ +| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | UMA RAM | NONUMA RAM | NONUMA VRAM | ++ 
+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ +| | llama | 32768 | true | false | 32 | 25.08 GiB | 395.24 MiB | 25.08 GiB | ++----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ + +``` + +#### Estimate with No MMap + +```shell +$ gguf-parser --repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --file="Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer --offload-layers=10 --no-mmap ++----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ +| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | UMA RAM | NONUMA RAM | NONUMA VRAM | ++ +-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ +| | llama | 32768 | false | false | 10 | 25.08 GiB | 17.50 GiB | 9.83 GiB | ++----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ + +``` + +#### Estimate step-by-step offload layers + +```shell +$ gguf-parser --repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --file="Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer --offload-layers=10 --offload-layers-step=5 ++----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ +| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | UMA RAM | NONUMA RAM | NONUMA VRAM | ++ +-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ +| | llama | 32768 | false | false | 0 | 25.08 GiB | 25.23 GiB | 2.10 GiB | ++ + + + + +----------------+ +------------+-------------+ +| | | | | | 5 | | 21.36 GiB | 5.97 GiB | ++ + + + + +----------------+ +------------+-------------+ +| | | | | | 10 | | 17.50 GiB | 9.83 GiB | ++ + + + + +----------------+ +------------+-------------+ +| | | | | | 15 | | 13.63 GiB | 13.70 GiB | ++ + + + + +----------------+ +------------+-------------+ +| | | | | | 20 | | 9.77 GiB | 17.56 GiB | ++ + + + + +----------------+ +------------+-------------+ +| | | | | | 25 | | 5.91 GiB | 21.42 GiB | ++ + + + + +----------------+ +------------+-------------+ +| | | | | | 32 | | 395.24 MiB | 26.94 GiB | ++----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ + +``` + +## License + +MIT diff --git a/cmd/gguf-parser/go.mod b/cmd/gguf-parser/go.mod index aa4cdde..f6441fe 100644 --- a/cmd/gguf-parser/go.mod +++ b/cmd/gguf-parser/go.mod @@ -10,7 +10,6 @@ require ( ) require ( - github.com/dustin/go-humanize v1.0.1 // indirect github.com/henvic/httpretty v0.1.3 // indirect github.com/mattn/go-runewidth v0.0.9 // indirect github.com/smallnest/ringbuffer v0.0.0-20240423223918-bab516b2000b // indirect diff --git a/cmd/gguf-parser/go.sum b/cmd/gguf-parser/go.sum index 59b81c1..2d428fb 100644 --- a/cmd/gguf-parser/go.sum +++ b/cmd/gguf-parser/go.sum @@ -1,7 +1,5 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= -github.com/dustin/go-humanize v1.0.1/go.mod 
h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
 github.com/henvic/httpretty v0.1.3 h1:4A6vigjz6Q/+yAfTD4wqipCv+Px69C7Th/NhT0ApuU8=
 github.com/henvic/httpretty v0.1.3/go.mod h1:UUEv7c2kHZ5SPQ51uS3wBpzPDibg2U3Y+IaXyHy5GBg=
 github.com/mattn/go-runewidth v0.0.9 h1:Lm995f3rfxdpd6TSmuVCHVb/QhupuXlYr8sCI/QdE+0=
diff --git a/cmd/gguf-parser/main.go b/cmd/gguf-parser/main.go
index 3a94237..cca74ea 100644
--- a/cmd/gguf-parser/main.go
+++ b/cmd/gguf-parser/main.go
@@ -7,6 +7,7 @@ import (
 	"context"
 	"strconv"
 	"strings"
+	"sync"
 
 	stdjson "encoding/json"
 
 	"github.com/olekukonko/tablewriter"
@@ -22,22 +23,22 @@ func main() {
 	// Parse arguments.
 	var (
-		// model
-		path        string
-		url         string
-		repo, model string
+		// model options
+		path       string
+		url        string
+		repo, file string
 		// read options
-		debug     bool
-		skipProxy bool
-		skipTLS   bool
+		debug         bool
+		skipTLSVerify bool
 		// estimate options
-		ctxSize        = -1
-		batchSize      = 512
-		parallelSize   = 1
-		kvType         = "f16"
-		offloadLayers  = -1
-		flashAttention bool
-		noMMap         bool
+		ctxSize           = -1
+		batchSize         = 512
+		parallelSize      = 1
+		kvType            = "f16"
+		flashAttention    bool
+		noMMap            bool
+		offloadLayers     = -1
+		offloadLayersStep uint64
 		// output options
 		version   bool
 		skipModel bool
@@ -52,34 +53,50 @@
 		_, _ = fmt.Fprintf(fs.Output(), "Usage of gguf-parser %v:\n", Version)
 		fs.PrintDefaults()
 	}
-	fs.StringVar(&path, "path", path, "Path to load model, e.g. ~/.cache"+
+	fs.StringVar(&path, "path", path, "Path of the GGUF file to load, e.g. ~/.cache"+
 		"/lm-studio/models/NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF/"+
-		"Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf")
-	fs.StringVar(&url, "url", url, "Url to load model, e.g. "+
+		"Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf.")
+	fs.StringVar(&url, "url", url, "URL of the GGUF file to load, e.g. "+
 		"https://huggingface.co/NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF"+
-		"/resolve/main/Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf")
-	fs.StringVar(&repo, "repo", repo, "Repo of HuggingFace, e.g. "+
-		"NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF")
-	fs.StringVar(&model, "model", model, "Model below the --repo, e.g. "+
-		"Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf")
-	fs.BoolVar(&debug, "debug", debug, "Debug mode")
-	fs.BoolVar(&skipProxy, "skip-proxy", skipProxy, "Skip using proxy when reading from a remote URL")
-	fs.BoolVar(&skipTLS, "skip-tls", skipTLS, "Skip TLS verification when reading from a remote URL")
-	fs.IntVar(&ctxSize, "ctx-size", ctxSize, "Context size to estimate memory usage, default is equal to the model's maximum context size")
-	fs.IntVar(&batchSize, "batch-size", batchSize, "Physical maximum batch size")
-	fs.IntVar(&parallelSize, "parallel", parallelSize, "Number of parallel sequences to decode")
-	fs.StringVar(&kvType, "kv-type", kvType, "Key-Value cache type, select from [f32, f16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1], "+
-		"using quantization type means enabling Flash Attention as well")
-	fs.IntVar(&offloadLayers, "offload-layers", offloadLayers, "Specify how many layers to offload, default is fully offloading")
-	fs.BoolVar(&flashAttention, "flash-attention", flashAttention, "Enable Flash Attention to reduce the memory usage, which influences the estimate result")
-	fs.BoolVar(&noMMap, "no-mmap", noMMap, "Disable using memory-mapped model(file) loading, which influences the estimate result")
-	fs.BoolVar(&version, "version", version, "Show version")
-	fs.BoolVar(&skipModel, "skip-model", skipModel, "Skip model metadata")
-	fs.BoolVar(&skipArchitecture, "skip-architecture", skipArchitecture, "Skip architecture metadata")
-	fs.BoolVar(&skipTokenizer, "skip-tokenizer", skipTokenizer, "Skip tokenizer metadata")
-	fs.BoolVar(&skipEstimate, "skip-estimate", skipEstimate, "Skip estimate")
-	fs.BoolVar(&json, "json", json, "Output as JSON")
-	fs.BoolVar(&jsonPretty, "json-pretty", jsonPretty, "Output as pretty JSON")
+		"/resolve/main/Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf. "+
+		"Note that gguf-parser does not need to download the entire GGUF file.")
+	fs.StringVar(&repo, "repo", repo, "HuggingFace repository where the GGUF file is stored, e.g. "+
+		"NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF, works with --file.")
+	fs.StringVar(&file, "file", file, "Model file below the --repo, e.g. "+
+		"Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf.")
+	fs.BoolVar(&debug, "debug", debug, "Enable debugging with verbose output.")
+	fs.BoolVar(&skipTLSVerify, "skip-tls-verify", skipTLSVerify, "Skip TLS verification, works with --url.")
+	fs.IntVar(&ctxSize, "ctx-size", ctxSize, "Specify the size of the prompt context, "+
+		"which is used to estimate the usage, "+
+		"default is equal to the model's maximum context size.")
+	fs.IntVar(&batchSize, "batch-size", batchSize, "Specify the physical maximum batch size, "+
+		"which is used to estimate the usage, "+
+		"default is 512.")
+	fs.IntVar(&parallelSize, "parallel-size", parallelSize, "Specify the number of parallel sequences to decode, "+
+		"which is used to estimate the usage, "+
+		"default is 1.")
+	fs.StringVar(&kvType, "kv-type", kvType, "Specify the type of Key-Value cache, "+
+		"which is used to estimate the usage, select from [f32, f16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1], "+
+		"default is f16. "+
+		"Using a quantization type means enabling --flash-attention as well.")
+	fs.BoolVar(&flashAttention, "flash-attention", flashAttention, "Specify enabling Flash Attention, "+
+		"which is used to estimate the usage. "+
+		"Flash Attention can reduce the usage of RAM/VRAM.")
+	fs.BoolVar(&noMMap, "no-mmap", noMMap, "Specify disabling memory-mapped loading, "+
+		"which is used to estimate the usage. "+
+		"Memory-mapped loading can avoid loading all of the model weights into RAM.")
+	fs.IntVar(&offloadLayers, "offload-layers", offloadLayers, "Specify how many layers to offload, "+
+		"which is used to estimate the usage, "+
+		"default is fully offloaded.")
+	fs.Uint64Var(&offloadLayersStep, "offload-layers-step", offloadLayersStep, "Specify the step of layers to offload, "+
+		"works with --offload-layers.")
+	fs.BoolVar(&version, "version", version, "Show gguf-parser version.")
+	fs.BoolVar(&skipModel, "skip-model", skipModel, "Skip displaying model metadata.")
+	fs.BoolVar(&skipArchitecture, "skip-architecture", skipArchitecture, "Skip displaying architecture metadata.")
+	fs.BoolVar(&skipTokenizer, "skip-tokenizer", skipTokenizer, "Skip displaying tokenizer metadata.")
+	fs.BoolVar(&skipEstimate, "skip-estimate", skipEstimate, "Skip estimating the usage.")
+	fs.BoolVar(&json, "json", json, "Output as JSON.")
+	fs.BoolVar(&jsonPretty, "json-pretty", jsonPretty, "Output as pretty JSON.")
 	if err := fs.Parse(os.Args[1:]); err != nil {
 		fmt.Println(err.Error())
 		os.Exit(1)
@@ -99,10 +116,7 @@ func main() {
 	if debug {
 		ropts = append(ropts, UseDebug())
 	}
-	if skipProxy {
-		ropts = append(ropts, SkipProxy())
-	}
-	if skipTLS {
+	if skipTLSVerify {
 		ropts = append(ropts, SkipTLSVerification())
 	}
 
@@ -141,9 +155,6 @@ func main() {
 		}
 		eopts = append(eopts, WithCacheKeyType(kv), WithCacheValueType(kv))
 	}
-	if offloadLayers >= 0 {
-		eopts = append(eopts, WithOffloadLayers(uint64(offloadLayers)))
-	}
 	if flashAttention {
 		eopts = append(eopts, WithFlashAttention())
 	}
@@ -152,6 +163,8 @@ func main() {
 
 	var gf *GGUFFile
 	{
+		ropts := ropts[:len(ropts):len(ropts)]
+
 		var err error
 		switch {
 		default:
@@ -161,8 +174,8 @@ func main() {
 			gf, err = ParseGGUFFile(path, ropts...)
 		case url != "":
 			gf, err = ParseGGUFFileRemote(ctx, url, ropts...)
-		case repo != "" && model != "":
-			gf, err = ParseGGUFFileFromHuggingFace(ctx, repo, model, ropts...)
+		case repo != "" && file != "":
+			gf, err = ParseGGUFFileFromHuggingFace(ctx, repo, file, ropts...)
 		}
 		if err != nil {
 			_, _ = fmt.Fprintf(os.Stderr, "failed to parse GGUF file: %s\n", err.Error())
 			os.Exit(1)
@@ -179,13 +192,18 @@ func main() {
 	if !skipModel {
 		m = gf.Model()
 	}
-	if !skipArchitecture {
+	if !skipArchitecture || !skipEstimate {
 		a = gf.Architecture()
 	}
-	if !skipTokenizer {
+	if !skipTokenizer || !skipEstimate {
 		t = gf.Tokenizer()
 	}
 	if !skipEstimate {
+		eopts := eopts[:len(eopts):len(eopts)]
+
+		if offloadLayers >= 0 {
+			eopts = append(eopts, WithOffloadLayers(uint64(offloadLayers)))
+		}
 		e = gf.EstimateLLaMACppUsage(eopts...)
 	}
 
@@ -204,6 +222,28 @@ func main() {
 	}
 	if !skipEstimate {
 		es := e.Summarize(!noMMap)
+		switch {
+		case offloadLayersStep > e.OffloadLayers:
+			offloadLayersStep = e.OffloadLayers
+		case offloadLayersStep <= 0:
+			offloadLayersStep = e.OffloadLayers
+		}
+		if offloadLayersStep < e.OffloadLayers {
+			ess := make([]LLaMACppUsageEstimateMemorySummary, e.OffloadLayers/offloadLayersStep+1)
+			var wg sync.WaitGroup
+			for i := 0; i < cap(ess); i++ {
+				wg.Add(1)
+				go func(i int) {
+					defer wg.Done()
+					eopts := eopts[:len(eopts):len(eopts)]
+					eopts = append(eopts, WithOffloadLayers(uint64(i)*offloadLayersStep))
+					ess[i] = gf.EstimateLLaMACppUsage(eopts...).SummarizeMemory(!noMMap)
+				}(i)
+			}
+			wg.Wait()
+			ess[cap(ess)-1] = es.Memory[0]
+			es.Memory = ess
+		}
 		o["estimate"] = es
 	}
 
@@ -270,27 +310,45 @@
 
 	if !skipEstimate {
 		es := e.Summarize(!noMMap)
-		tprint(
-			"ESTIMATE",
-			[]string{"Arch", "Context Size", "Full Offload", "Flash Attention", "MMap Support", "Mem. 
Arch", "Usage"}, - []string{ - sprintf(es.Architecture), - sprintf(es.ContextSize), - sprintf(es.FullOffload), - sprintf(es.FlashAttention), - sprintf(!es.NoMMap), - "UMA", - sprintf(es.UMA), - }, - []string{ + switch { + case offloadLayersStep > e.OffloadLayers: + offloadLayersStep = e.OffloadLayers + case offloadLayersStep <= 0: + offloadLayersStep = e.OffloadLayers + } + if offloadLayersStep < e.OffloadLayers { + ess := make([]LLaMACppUsageEstimateMemorySummary, e.OffloadLayers/offloadLayersStep+1) + var wg sync.WaitGroup + for i := 0; i < cap(ess); i++ { + wg.Add(1) + go func(i int) { + defer wg.Done() + eopts := eopts[:len(eopts):len(eopts)] + eopts = append(eopts, WithOffloadLayers(uint64(i)*offloadLayersStep)) + ess[i] = gf.EstimateLLaMACppUsage(eopts...).SummarizeMemory(!noMMap) + }(i) + } + wg.Wait() + ess[cap(ess)-1] = es.Memory[0] + es.Memory = ess + } + bd := make([][]string, len(es.Memory)) + for i := range es.Memory { + bd[i] = []string{ sprintf(es.Architecture), sprintf(es.ContextSize), - sprintf(es.FullOffload), sprintf(es.FlashAttention), sprintf(!es.NoMMap), - "NonUMA", - sprintf("%s (RAM) + %s (VRAM)", es.NonUMA.RAM, es.NonUMA.VRAM), - }) + sprintf(es.Memory[i].OffloadLayers), + sprintf(es.Memory[i].UMA), + sprintf(es.Memory[i].NonUMA.RAM), + sprintf(es.Memory[i].NonUMA.VRAM), + } + } + tprint( + "ESTIMATE", + []string{"Arch", "Context Size", "Flash Attention", "MMap Support", "Offload Layers", "UMA RAM", "NonUMA RAM", "NonUMA VRAM"}, + bd...) } } @@ -337,7 +395,7 @@ func tprint(title string, header []string, body ...[]string) { tb.SetAlignment(tablewriter.ALIGN_CENTER) tb.SetHeaderLine(true) tb.SetRowLine(true) - tb.SetAutoMergeCellsByColumnIndex([]int{0, 1, 2, 3, 4}) + tb.SetAutoMergeCells(true) tb.Append(append([]string{title}, header...)) for i := range body { tb.Append(append([]string{title}, body[i]...)) diff --git a/file.go b/file.go index f7ce060..569f8e4 100644 --- a/file.go +++ b/file.go @@ -13,7 +13,6 @@ import ( "strings" "time" - "github.com/dustin/go-humanize" "golang.org/x/exp/constraints" "github.com/thxcode/gguf-parser-go/util/bytex" @@ -298,8 +297,8 @@ func ParseGGUFFileRemote(ctx context.Context, url string, opts ...GGUFReadOption // ParseGGUFFileFromHuggingFace parses a GGUF file from Hugging Face, // and returns a GGUFFile, or an error if any. -func ParseGGUFFileFromHuggingFace(ctx context.Context, repo, model string, opts ...GGUFReadOption) (*GGUFFile, error) { - return ParseGGUFFileRemote(ctx, fmt.Sprintf("https://huggingface.co/%s/resolve/main/%s", repo, model), opts...) +func ParseGGUFFileFromHuggingFace(ctx context.Context, repo, file string, opts ...GGUFReadOption) (*GGUFFile, error) { + return ParseGGUFFileRemote(ctx, fmt.Sprintf("https://huggingface.co/%s/resolve/main/%s", repo, file), opts...) 
}
 
 func parseGGUFFile(s int64, f io.ReadSeeker, o _GGUFReadOptions) (_ *GGUFFile, err error) {
@@ -507,31 +506,78 @@ func (gf *GGUFFile) layers() GGUFLayerTensorInfos {
 	return ret
 }
 
+// IEC binary byte sizes.
+const (
+	_KiBytes = 1 << ((iota + 1) * 10)
+	_MiBytes
+	_GiBytes
+	_TiBytes
+	_PiBytes
+)
+
 func (s GGUFBytesScalar) String() string {
 	if s == 0 {
 		return "0 B"
 	}
-	return humanize.IBytes(uint64(s))
+	b, u := float64(1), "B"
+	switch {
+	case s >= _PiBytes:
+		b = _PiBytes
+		u = "PiB"
+	case s >= _TiBytes:
+		b = _TiBytes
+		u = "TiB"
+	case s >= _GiBytes:
+		b = _GiBytes
+		u = "GiB"
+	case s >= _MiBytes:
+		b = _MiBytes
+		u = "MiB"
+	case s >= _KiBytes:
+		b = _KiBytes
+		u = "KiB"
+	}
+	f := strconv.FormatFloat(float64(s)/b, 'f', 2, 64)
+	return strings.TrimSuffix(f, ".00") + " " + u
 }
 
+// SI decimal scales for parameter counts.
+const (
+	_Thousand    = 1e3
+	_Million     = 1e6
+	_Billion     = 1e9
+	_Trillion    = 1e12
+	_Quadrillion = 1e15
+)
+
 func (s GGUFParametersScalar) String() string {
 	if s == 0 {
 		return "0"
 	}
+	b, u := float64(1), ""
 	switch {
-	case s >= 1e15:
-		return humanize.CommafWithDigits(float64(s)/1e15, 1) + " Q"
-	case s >= 1e12:
-		return humanize.CommafWithDigits(float64(s)/1e12, 1) + " T"
-	case s >= 1e9:
-		return humanize.CommafWithDigits(float64(s)/1e9, 1) + " B"
-	case s >= 1e6:
-		return humanize.CommafWithDigits(float64(s)/1e6, 1) + " M"
-	case s >= 1e3:
-		return humanize.CommafWithDigits(float64(s)/1e3, 1) + " K"
-	default:
-		return strconv.Itoa(int(s))
-	}
+	case s >= _Quadrillion:
+		b = _Quadrillion
+		u = "Q"
+	case s >= _Trillion:
+		b = _Trillion
+		u = "T"
+	case s >= _Billion:
+		b = _Billion
+		u = "B"
+	case s >= _Million:
+		b = _Million
+		u = "M"
+	case s >= _Thousand:
+		b = _Thousand
+		u = "K"
+	}
+	f := strconv.FormatFloat(float64(s)/b, 'f', 2, 64)
+	f = strings.TrimSuffix(f, ".00")
+	if u == "" {
+		// Below one thousand there is no unit suffix; avoid a trailing space.
+		return f
+	}
+	return f + " " + u
 }
 
 func (s GGUFBitsPerWeightScalar) String() string {
diff --git a/file_estimate.go b/file_estimate.go
index 96bb9db..8a398d6 100644
--- a/file_estimate.go
+++ b/file_estimate.go
@@ -16,14 +16,16 @@ type (
 		// FlashAttention is the flag to indicate whether enable the flash attention,
 		// true for enable.
 		FlashAttention bool `json:"flashAttention"`
+		// ContextSize is the size of the context.
+		ContextSize uint64 `json:"contextSize"`
 		// FullOffload is the flag to indicate whether the layers are fully offloaded,
 		// false for partial offloaded or zero offloaded.
 		FullOffload bool `json:"fullOffload"`
+		// OffloadLayers is the number of offloaded layers.
+		OffloadLayers uint64 `json:"offloadLayers"`
 		// NoMMap is the flag to indicate whether the file must be loaded without mmap,
 		// true for total loaded.
 		NoMMap bool `json:"noMMap"`
-		// ContextSize is the size of the context.
-		ContextSize uint64 `json:"contextSize"`
 		// Load is the memory usage for running the GGUF file in RAM.
 		Load LLaMACppMemoryUsage `json:"load"`
 		// Offload is the memory usage for loading the GGUF file in VRAM.
@@ -86,7 +88,21 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) (
 		o.CacheValueType = ptr.To(GGMLTypeF16)
 	}
 
-	a, t := gf.Architecture(), gf.Tokenizer()
+	// Architecture and tokenizer metadata.
+	var (
+		a GGUFArchitectureMetadata
+		t GGUFTokenizerMetadata
+	)
+	if o.Architecture != nil {
+		a = *o.Architecture
+	} else {
+		a = gf.Architecture()
+	}
+	if o.Tokenizer != nil {
+		t = *o.Tokenizer
+	} else {
+		t = gf.Tokenizer()
+	}
 	e.Architecture = a.Architecture
 
 	// Flash attention.
@@ -163,6 +179,7 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) (
 		nLoadLayers -= nOffloadLayers
 
 		e.FullOffload = isOffloadOutputLayer && nLoadLayers == 0
+		e.OffloadLayers = nOffloadLayers
 	}
 
 	// Footprint.
@@ -368,47 +385,58 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) (
 	return e
 }
 
-// LLaMACppUsageEstimateSummery represents the summary of the usage for loading the GGUF file in llama.cpp.
-type LLaMACppUsageEstimateSummery struct {
-	/* Basic */
-
-	// UMA represents the usage of Unified Memory Architecture.
-	UMA GGUFBytesScalar `json:"uma"`
-	// NonUMA represents the usage of Non-Unified Memory Architecture.
-	NonUMA struct {
-		// Load is the memory usage for loading the GGUF file in Load.
-		RAM GGUFBytesScalar `json:"ram"`
-		// VRAM is the memory usage for loading the GGUF file in VRAM.
-		VRAM GGUFBytesScalar `json:"vram"`
-	} `json:"nonUMA"`
-
-	/* Appendix */
-
-	// Architecture describes what architecture this model implements.
-	Architecture string `json:"architecture"`
-	// FlashAttention is the flag to indicate whether enable the flash attention,
-	// true for enable.
-	FlashAttention bool `json:"flashAttention"`
-	// FullOffload is the flag to indicate whether the layers are fully offloaded,
-	// false for partial offloaded or zero offloaded.
-	FullOffload bool `json:"fullOffload"`
-	// NoMMap is the flag to indicate whether the file must be loaded without mmap,
-	// true for total loaded.
-	NoMMap bool `json:"noMMap"`
-	// ContextSize is the size of the context.
-	ContextSize uint64 `json:"contextSize"`
-}
+// Types for LLaMACpp estimated summary.
+type (
+	// LLaMACppUsageEstimateSummary represents the summary of the usage for loading the GGUF file in llama.cpp.
+	LLaMACppUsageEstimateSummary struct {
+		/* Basic */
+
+		// Memory is the list of memory usage summaries, one entry per offload-layers choice.
+		Memory []LLaMACppUsageEstimateMemorySummary `json:"memory"`
+
+		/* Appendix */
+
+		// Architecture describes what architecture this model implements.
+		Architecture string `json:"architecture"`
+		// ContextSize is the size of the context.
+		ContextSize uint64 `json:"contextSize"`
+		// FlashAttention is the flag to indicate whether to enable the flash attention,
+		// true for enabled.
+		FlashAttention bool `json:"flashAttention"`
+		// NoMMap is the flag to indicate whether the file must be loaded without mmap,
+		// true for fully loaded.
+		NoMMap bool `json:"noMMap"`
+	}
+
+	// LLaMACppUsageEstimateMemorySummary represents the memory summary of the usage for loading the GGUF file in llama.cpp.
+	LLaMACppUsageEstimateMemorySummary struct {
+		// OffloadLayers is the number of offloaded layers.
+		OffloadLayers uint64 `json:"offloadLayers"`
+		// UMA represents the usage of Unified Memory Architecture.
+		UMA GGUFBytesScalar `json:"uma"`
+		// NonUMA represents the usage of Non-Unified Memory Architecture.
+		NonUMA struct {
+			// RAM is the memory usage for loading the GGUF file in RAM.
+			RAM GGUFBytesScalar `json:"ram"`
+			// VRAM is the memory usage for loading the GGUF file in VRAM.
+			VRAM GGUFBytesScalar `json:"vram"`
+		} `json:"nonUMA"`
+	}
+)
+
+// SummarizeMemory summarizes the memory usage of loading the GGUF file in llama.cpp.
+func (e LLaMACppUsageEstimate) SummarizeMemory(mmap bool) (ems LLaMACppUsageEstimateMemorySummary) {
+	ems.OffloadLayers = e.OffloadLayers
 
-func (e LLaMACppUsageEstimate) Summarize(mmap bool) (es LLaMACppUsageEstimateSummery) {
 	// UMA.
{ fp := e.Load.Footprint + e.Offload.Footprint wg := e.Load.Weight.Sum() + e.Offload.Weight.Sum() kv := e.Load.KVCache.Sum() + e.Offload.KVCache.Sum() cp := e.Load.Computation.Sum() - es.UMA = fp + wg + kv + cp + ems.UMA = fp + wg + kv + cp if !e.NoMMap && mmap { - es.UMA -= wg + ems.UMA -= wg } } @@ -431,24 +457,32 @@ func (e LLaMACppUsageEstimate) Summarize(mmap bool) (es LLaMACppUsageEstimateSum wg := e.Load.Weight.Sum() kv := e.Load.KVCache.Sum() cp := e.Load.Computation.Sum() - es.NonUMA.RAM = fp + wg + kv + cp + ems.NonUMA.RAM = fp + wg + kv + cp if !e.NoMMap && (mmap || e.FullOffload) { - es.NonUMA.RAM -= wg + ems.NonUMA.RAM -= wg } // VRAM. fp = e.Offload.Footprint wg = e.Offload.Weight.Sum() kv = e.Offload.KVCache.Sum() cp = e.Offload.Computation.Sum() - es.NonUMA.VRAM = fp + wg + kv + cp + ems.NonUMA.VRAM = fp + wg + kv + cp + } + + return ems +} + +func (e LLaMACppUsageEstimate) Summarize(mmap bool) (es LLaMACppUsageEstimateSummary) { + // Summarize memory. + es.Memory = []LLaMACppUsageEstimateMemorySummary{ + e.SummarizeMemory(mmap), } // Just copy from the original estimate. es.Architecture = e.Architecture + es.ContextSize = e.ContextSize es.FlashAttention = e.FlashAttention - es.FullOffload = e.FullOffload es.NoMMap = e.NoMMap - es.ContextSize = e.ContextSize return es } diff --git a/file_estimate_option.go b/file_estimate_option.go index 3c48053..813b54b 100644 --- a/file_estimate_option.go +++ b/file_estimate_option.go @@ -6,6 +6,8 @@ import ( type ( _LLaMACppUsageEstimateOptions struct { + Architecture *GGUFArchitectureMetadata + Tokenizer *GGUFTokenizerMetadata ContextSize *int32 BatchSize *int32 ParallelSize *int32 @@ -17,6 +19,24 @@ type ( LLaMACppUsageEstimateOption func(*_LLaMACppUsageEstimateOptions) ) +// WithArchitecture sets the architecture for the estimate. +// +// Allows reusing the same GGUFArchitectureMetadata for multiple estimates. +func WithArchitecture(arch GGUFArchitectureMetadata) LLaMACppUsageEstimateOption { + return func(o *_LLaMACppUsageEstimateOptions) { + o.Architecture = &arch + } +} + +// WithTokenizer sets the tokenizer for the estimate. +// +// Allows reusing the same GGUFTokenizerMetadata for multiple estimates. +func WithTokenizer(tokenizer GGUFTokenizerMetadata) LLaMACppUsageEstimateOption { + return func(o *_LLaMACppUsageEstimateOptions) { + o.Tokenizer = &tokenizer + } +} + // WithContextSize sets the context size for the estimate. 
func WithContextSize(size int32) LLaMACppUsageEstimateOption { return func(o *_LLaMACppUsageEstimateOptions) { diff --git a/go.mod b/go.mod index df64f18..2998fb2 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,6 @@ go 1.22 require ( github.com/davecgh/go-spew v1.1.1 - github.com/dustin/go-humanize v1.0.1 github.com/henvic/httpretty v0.1.3 github.com/smallnest/ringbuffer v0.0.0-20240423223918-bab516b2000b github.com/stretchr/testify v1.9.0 diff --git a/go.sum b/go.sum index e2f09c1..467e640 100644 --- a/go.sum +++ b/go.sum @@ -1,7 +1,5 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= -github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= github.com/henvic/httpretty v0.1.3 h1:4A6vigjz6Q/+yAfTD4wqipCv+Px69C7Th/NhT0ApuU8= github.com/henvic/httpretty v0.1.3/go.mod h1:UUEv7c2kHZ5SPQ51uS3wBpzPDibg2U3Y+IaXyHy5GBg= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
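
As a usage note for the API reworked above, the sketch below shows one way the new pieces compose: parse a file once, reuse the decoded metadata through the new `WithArchitecture`/`WithTokenizer` options, then sweep `WithOffloadLayers` and collect `SummarizeMemory` rows the way `cmd/gguf-parser` now implements `--offload-layers-step`. This is a minimal sketch, not shipped code: the model path is a placeholder and the `parser` import alias is an assumption; only functions and types introduced or kept by this diff are used.

```go
package main

import (
	"fmt"

	parser "github.com/thxcode/gguf-parser-go"
)

func main() {
	// Placeholder path: point this at any local GGUF file.
	gf, err := parser.ParseGGUFFile("/path/to/model.gguf")
	if err != nil {
		panic(err)
	}

	// Decode the architecture/tokenizer metadata once and reuse it for every
	// estimate below via the new options, instead of re-deriving it per call.
	base := []parser.LLaMACppUsageEstimateOption{
		parser.WithArchitecture(gf.Architecture()),
		parser.WithTokenizer(gf.Tokenizer()),
	}

	// The default estimate is fully offloaded; its layer count bounds the sweep.
	full := gf.EstimateLLaMACppUsage(base...)

	const step uint64 = 5
	for i := uint64(0); i < full.OffloadLayers; i += step {
		opts := append(base[:len(base):len(base)], parser.WithOffloadLayers(i))
		m := gf.EstimateLLaMACppUsage(opts...).SummarizeMemory(true) // true: mmap is usable
		fmt.Printf("offload %3d layers: UMA %s, NonUMA %s (RAM) + %s (VRAM)\n",
			m.OffloadLayers, m.UMA, m.NonUMA.RAM, m.NonUMA.VRAM)
	}

	// Finish with the fully offloaded row, mirroring the CLI when
	// --offload-layers-step does not land exactly on the last layer.
	m := full.SummarizeMemory(true)
	fmt.Printf("offload %3d layers: UMA %s, NonUMA %s (RAM) + %s (VRAM)\n",
		m.OffloadLayers, m.UMA, m.NonUMA.RAM, m.NonUMA.VRAM)
}
```

The CLI runs the same sweep concurrently with a `sync.WaitGroup`; a serial loop keeps the sketch short, and passing `true` to `SummarizeMemory` matches the default `--no-mmap=false` behavior.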