diff --git a/Makefile b/Makefile
index 2f9b858..e88928d 100644
--- a/Makefile
+++ b/Makefile
@@ -82,7 +82,7 @@ gguf-parser:
 			GOOS="$$os" GOARCH="$$arch" CGO_ENABLED=1 go build \
 				-trimpath \
 				-ldflags="-w -s -X main.Version=$(VERSION)" \
-				-tags="$$tags" \
+				-tags="urfave_cli_no_docs $$tags" \
 				-o $(SRCDIR)/.dist/gguf-parser-$$os-$$arch$$suffix; \
 		done; \
 		if [[ $$os == "darwin" ]]; then \
diff --git a/cmd/gguf-parser/README.md b/cmd/gguf-parser/README.md
index 41ac2f4..3858e19 100644
--- a/cmd/gguf-parser/README.md
+++ b/cmd/gguf-parser/README.md
@@ -6,103 +6,86 @@ Review/Check/Estimate [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/
 ```shell
 $ gguf-parser --help
-Usage of gguf-parser ...:
-  -ctx-size int
-        Specify the size of prompt context, which is used to estimate the usage, default is equal to the model's maximum context size. (default -1)
-  -debug
-        Enable debugging, verbosity.
-  -draft-path string
-        Path where the GGUF file to load for the draft model, optional, e.g. ~/.cache/lm-studio/models/QuantFactory/Qwen2-1.5B-Instruct-GGUF/Qwen2-1.5B-Instruct.Q5_K_M.gguf
-  -draft-url string
-        Url where the GGUF file to load for the draft model, optional, e.g. https://huggingface.co/QuantFactory/Qwen2-1.5B-Instruct-GGUF/resolve/main/Qwen2-1.5B-Instruct.Q5_K_M.gguf. Note that gguf-parser does not need to download the entire GGUF file.
-  -flash-attention
-        Specify enabling Flash Attention, which is used to estimate the usage. Flash Attention can reduce the usage of RAM/VRAM.
-  -gpu-layers int
-        Specify how many layers of the main model to offload, which is used to estimate the usage, default is full offloaded. (default -1)
-  -gpu-layers-draft int
-        Specify how many layers of the draft model to offload, which is used to estimate the usage, default is full offloaded. (default -1)
-  -gpu-layers-step uint
-        Specify the step of layers to offload, works with --gpu-layers.
-  -hf-draft-file string
-        Model file below the --hf-draft-repo, optional, e.g. Qwen2-1.5B-Instruct.Q5_K_M.gguf.
-  -hf-draft-repo string
-        Repository of HuggingFace which the GGUF file store for the draft model, optional, e.g. QuantFactory/Qwen2-1.5B-Instruct-GGUF, works with --hf-draft-file.
-  -hf-file string
-        Model file below the --hf-repo, e.g. Qwen2-7B-Instruct.Q5_K_M.gguf.
-  -hf-mmproj-file string
-        Multimodal projector file below the --hf-repo.
-  -hf-repo string
-        Repository of HuggingFace which the GGUF file store for the main model, e.g. QuantFactory/Qwen2-7B-Instruct-GGUF, works with --hf-file.
-  -hf-token string
-        User access token of HuggingFace, optional, works with --hf-repo/--hf-file pair or --hf-draft-repo/--hf-draft-file pair. See https://huggingface.co/settings/tokens.
-  -in-max-ctx-size
-        Limit the context size to the maximum context size of the model, if the context size is larger than the maximum context size.
-  -in-mib
-        Display the estimated result in table with MiB.
-  -json
-        Output as JSON.
-  -json-pretty
-        Output as pretty JSON. (default true)
-  -kv-type string
-        Specify the type of Key-Value cache, which is used to estimate the usage, select from [f32, f16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1], default is f16. Use quantization type means enabling --flash-attention as well. (default "f16")
-  -mmproj-path string
-        Path where the GGUF file to load for the multimodal projector, optional.
-  -mmproj-url string
-        Url where the GGUF file to load for the multimodal projector, optional.
-  -ms-draft-file string
-        Model file below the --ms-draft-repo, optional, e.g. qwen1_5-1_8b-chat-q5_k_m.gguf.
-  -ms-draft-repo string
-        Repository of ModelScope which the GGUF file store for the draft model, optional, e.g. qwen/Qwen1.5-1.8B-Chat-GGUF, works with --ms-draft-file.
-  -ms-file string
-        Model file below the --ms-repo, e.g. qwen1_5-7b-chat-q5_k_m.gguf.
-  -ms-mmproj-file string
-        Multimodal projector file below the --ms-repo.
-  -ms-repo string
-        Repository of ModelScope which the GGUF file store for the main model, e.g. qwen/Qwen1.5-7B-Chat-GGUF, works with --ms-file.
-  -ms-token string
-        Git access token of ModelScope, optional, works with --ms-repo/--ms-file pair or --ms-draft-repo/--ms-draft-file pair. See https://modelscope.cn/my/myaccesstoken.
-  -no-kv-offload
-        Specify disabling Key-Value offloading, which is used to estimate the usage. Key-Value offloading can reduce the usage of VRAM.
-  -no-mmap
-        Specify disabling Memory-Mapped using, which is used to estimate the usage. Memory-Mapped can avoid loading the entire model weights into RAM.
-  -ol-model string
-        Model name of Ollama, e.g. gemma2.
-  -ol-usage
-        Specify respecting the extending layers introduced by Ollama, works with --ol-model, which affects the usage estimation.
-  -parallel-size int
-        Specify the number of parallel sequences to decode, which is used to estimate the usage, default is 1. (default 1)
-  -path string
-        Path where the GGUF file to load for the main model, e.g. ~/.cache/lm-studio/models/QuantFactory/Qwen2-7B-Instruct-GGUF/Qwen2-7B-Instruct.Q5_K_M.gguf.
-  -platform-footprint cudaMemGetInfo
-        Specify the platform footprint(RAM,VRAM) in MiB, which is used to estimate the NonUMA usage, default is 150,250. Different platform always gets different RAM and VRAM footprints, for example, within CUDA, cudaMemGetInfo would occupy some RAM and VRAM, see https://stackoverflow.com/questions/64854862/free-memory-occupied-by-cudamemgetinfo. (default "150,250")
-  -raw
-        Output the file only, skip anything.
-  -skip-architecture
-        Skip to display architecture metadata.
-  -skip-cache
-        Skip cache, works with --url/--hf-*/--ms-*/--ol-*, default is caching the read result.
-  -skip-dns-cache
-        Skip DNS cache, works with --url/--hf-*/--ms-*/--ol-*, default is caching the DNS lookup result.
-  -skip-estimate
-        Skip to estimate.
-  -skip-model
-        Skip to display model metadata.
-  -skip-proxy
-        Skip proxy settings, works with --url/--hf-*/--ms-*/--ol-*, default is respecting the environment variables HTTP_PROXY/HTTPS_PROXY/NO_PROXY.
-  -skip-rang-download-detect
-        Skip range download detect, works with --url/--hf-*/--ms-*/--ol-*, default is detecting the range download support.
-  -skip-tls-verify
-        Skip TLS verification, works with --url/--hf-*/--ms-*/--ol-*, default is verifying the TLS certificate on HTTPs request.
-  -skip-tokenizer
-        Skip to display tokenizer metadata
-  -token string
-        Bearer auth token to load GGUF file, optional, works with --url/--draft-url.
-  -ubatch-size int
-        Specify the physical maximum batch size, which is used to estimate the usage, default is 512. (default 512)
-  -url string
-        Url where the GGUF file to load for the main model, e.g. https://huggingface.co/QuantFactory/Qwen2-7B-Instruct-GGUF/resolve/main/Qwen2-7B-Instruct.Q5_K_M.gguf. Note that gguf-parser does not need to download the entire GGUF file.
-  -version
-        Show gguf-parser version.
+NAME:
+   gguf-parser - Review/Check/Estimate the GGUF file.
+
+USAGE:
+   gguf-parser [global options]
+
+GLOBAL OPTIONS:
+   --debug        Enable debugging, verbosity. (default: false)
+   --help, -h     Print the usage.
+   --version, -v  Print the version.
+
+   Estimate
+
+   --ctx-size value, -c value                           Specify the size of prompt context, which is used to estimate the usage, default is equal to the model's maximum context size. (default: -1)
+   --flash-attention, --fa                              Specify enabling Flash Attention, which is used to estimate the usage. Flash Attention can reduce the usage of RAM/VRAM. (default: false)
+   --gpu-layers value, --ngl value                      Specify how many layers of the main model to offload, which is used to estimate the usage, default is full offloaded. (default: -1)
+   --gpu-layers-draft value, --ngld value               Specify how many layers of the draft model to offload, which is used to estimate the usage, default is full offloaded. (default: -1)
+   --gpu-layers-step value                              Specify the step of layers to offload, works with --gpu-layers. (default: 0)
+   --in-max-ctx-size                                    Limit the context size to the maximum context size of the model, if the context size is larger than the maximum context size. (default: false)
+   --kv-type value                                      Specify the type of Key-Value cache, which is used to estimate the usage, select from [f32, f16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1]. (default: "f16")
+   --no-kv-offload, --nkvo                              Specify disabling Key-Value offloading, which is used to estimate the usage. Disabling Key-Value offloading can reduce the usage of VRAM. (default: false)
+   --no-mmap                                            Specify disabling Memory-Mapped usage, which is used to estimate the usage. Memory-Mapped can avoid loading the entire model weights into RAM. (default: false)
+   --parallel-size value, --parallel value, --np value  Specify the number of parallel sequences to decode, which is used to estimate the usage. (default: 1)
+   --platform-footprint value                           Specify the platform footprint (RAM, VRAM) in MiB, which is used to estimate the NonUMA usage, default is 150,250. Different platforms always get different RAM and VRAM footprints, for example, within CUDA, 'cudaMemGetInfo' would occupy some RAM and VRAM, see https://stackoverflow.com/questions/64854862/free-memory-occupied-by-cudamemgetinfo. (default: "150,250")
+   --ubatch-size value, --ub value                      Specify the physical maximum batch size, which is used to estimate the usage. (default: 512)
+
+   Load
+
+   --skip-cache                 Skip cache, works with --url/--hf-*/--ms-*/--ol-*, default is caching the read result. (default: false)
+   --skip-dns-cache             Skip DNS cache, works with --url/--hf-*/--ms-*/--ol-*, default is caching the DNS lookup result. (default: false)
+   --skip-proxy                 Skip proxy settings, works with --url/--hf-*/--ms-*/--ol-*, default is respecting the environment variables HTTP_PROXY/HTTPS_PROXY/NO_PROXY. (default: false)
+   --skip-rang-download-detect  Skip range download detection, works with --url/--hf-*/--ms-*/--ol-*, default is detecting the range download support. (default: false)
+   --skip-tls-verify            Skip TLS verification, works with --url/--hf-*/--ms-*/--ol-*, default is verifying the TLS certificate on HTTPS requests. (default: false)
+
+   Model/Local
+
+   --draft-path value, --model-draft value, --md value  Path of the GGUF file to load for the draft model, optional, e.g. ~/.cache/lm-studio/models/QuantFactory/Qwen2-1.5B-Instruct-GGUF/Qwen2-1.5B-Instruct.Q5_K_M.gguf.
+   --mmproj-path value, --mmproj value                  Path of the GGUF file to load for the multimodal projector, optional.
+   --path value, --model value, -m value                Path of the GGUF file to load for the main model, e.g. ~/.cache/lm-studio/models/QuantFactory/Qwen2-7B-Instruct-GGUF/Qwen2-7B-Instruct.Q5_K_M.gguf.
+
+   Model/Remote
+
+   --draft-url value                           URL of the GGUF file to load for the draft model, optional, e.g. https://huggingface.co/QuantFactory/Qwen2-1.5B-Instruct-GGUF/resolve/main/Qwen2-1.5B-Instruct.Q5_K_M.gguf. Note that gguf-parser does not need to download the entire GGUF file.
+   --mmproj-url value                          URL of the GGUF file to load for the multimodal projector, optional.
+   --token value                               Bearer auth token to load GGUF file, optional, works with --url/--draft-url.
+   --url value, --model-url value, --mu value  URL of the GGUF file to load for the main model, e.g. https://huggingface.co/QuantFactory/Qwen2-7B-Instruct-GGUF/resolve/main/Qwen2-7B-Instruct.Q5_K_M.gguf. Note that gguf-parser does not need to download the entire GGUF file.
+
+   Model/Remote/HuggingFace
+
+   --hf-draft-file value          Model file below the --hf-draft-repo, optional, e.g. Qwen2-1.5B-Instruct.Q5_K_M.gguf.
+   --hf-draft-repo value          Repository of HuggingFace which stores the GGUF file for the draft model, optional, e.g. QuantFactory/Qwen2-1.5B-Instruct-GGUF, works with --hf-draft-file.
+   --hf-file value, --hff value   Model file below the --hf-repo, e.g. Qwen2-7B-Instruct.Q5_K_M.gguf.
+   --hf-mmproj-file value         Multimodal projector file below the --hf-repo.
+   --hf-repo value, --hfr value   Repository of HuggingFace which stores the GGUF file for the main model, e.g. QuantFactory/Qwen2-7B-Instruct-GGUF, works with --hf-file.
+   --hf-token value, --hft value  User access token of HuggingFace, optional, works with --hf-repo/--hf-file pair or --hf-draft-repo/--hf-draft-file pair. See https://huggingface.co/settings/tokens.
+
+   Model/Remote/ModelScope
+
+   --ms-draft-file value   Model file below the --ms-draft-repo, optional, e.g. qwen1_5-1_8b-chat-q5_k_m.gguf.
+   --ms-draft-repo value   Repository of ModelScope which stores the GGUF file for the draft model, optional, e.g. qwen/Qwen1.5-1.8B-Chat-GGUF, works with --ms-draft-file.
+   --ms-file value         Model file below the --ms-repo, e.g. qwen1_5-7b-chat-q5_k_m.gguf.
+   --ms-mmproj-file value  Multimodal projector file below the --ms-repo.
+   --ms-repo value         Repository of ModelScope which stores the GGUF file for the main model, e.g. qwen/Qwen1.5-7B-Chat-GGUF, works with --ms-file.
+   --ms-token value        Git access token of ModelScope, optional, works with --ms-repo/--ms-file pair or --ms-draft-repo/--ms-draft-file pair. See https://modelscope.cn/my/myaccesstoken.
+
+   Model/Remote/Ollama
+
+   --ol-model value  Model name of Ollama, e.g. gemma2.
+   --ol-usage        Specify respecting the extending layers introduced by Ollama, works with --ol-model, which affects the usage estimation. (default: false)
+
+   Output
+
+   --in-mib             Display the estimated result in the table in MiB. (default: false)
+   --json               Output as JSON. (default: false)
+   --json-pretty        Output as pretty JSON. (default: true)
+   --raw                Output the file in JSON only, skip everything else. (default: false)
+   --skip-architecture  Skip displaying architecture metadata. (default: false)
+   --skip-estimate      Skip estimating. (default: false)
+   --skip-model         Skip displaying model metadata. (default: false)
+   --skip-tokenizer     Skip displaying tokenizer metadata. (default: false)
 ```
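As a quick reference for the reorganized options above, a typical invocation might look like the following (the model path and flag values are illustrative only, not part of this change):

```shell
$ gguf-parser --path ~/.cache/lm-studio/models/QuantFactory/Qwen2-7B-Instruct-GGUF/Qwen2-7B-Instruct.Q5_K_M.gguf \
    --ctx-size 8192 \
    --gpu-layers 32 \
    --in-mib
```

The same run can be written with the short aliases shown in the help output, e.g. `-m`, `-c`, and `--ngl`.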
diff --git a/cmd/gguf-parser/go.mod b/cmd/gguf-parser/go.mod
index 8b30146..3c4fded 100644
--- a/cmd/gguf-parser/go.mod
+++ b/cmd/gguf-parser/go.mod
@@ -7,17 +7,22 @@ replace github.com/thxcode/gguf-parser-go => ../../
 require (
 	github.com/olekukonko/tablewriter v0.0.5
 	github.com/thxcode/gguf-parser-go v0.0.0-00010101000000-000000000000
+	github.com/urfave/cli/v2 v2.27.2
 )
 
 require (
+	github.com/cpuguy83/go-md2man/v2 v2.0.4 // indirect
 	github.com/henvic/httpretty v0.1.3 // indirect
 	github.com/json-iterator/go v1.1.12 // indirect
-	github.com/mattn/go-runewidth v0.0.9 // indirect
+	github.com/mattn/go-runewidth v0.0.15 // indirect
 	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
 	github.com/modern-go/reflect2 v1.0.2 // indirect
+	github.com/rivo/uniseg v0.4.7 // indirect
 	github.com/rs/dnscache v0.0.0-20230804202142-fc85eb664529 // indirect
+	github.com/russross/blackfriday/v2 v2.1.0 // indirect
 	github.com/smallnest/ringbuffer v0.0.0-20240423223918-bab516b2000b // indirect
-	golang.org/x/exp v0.0.0-20240707233637-46b078467d37 // indirect
+	github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect
+	golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect
 	golang.org/x/mod v0.19.0 // indirect
 	golang.org/x/sync v0.7.0 // indirect
 	golang.org/x/sys v0.22.0 // indirect
diff --git a/cmd/gguf-parser/go.sum b/cmd/gguf-parser/go.sum
index 649da1f..3e54400 100644
--- a/cmd/gguf-parser/go.sum
+++ b/cmd/gguf-parser/go.sum
@@ -1,3 +1,5 @@
+github.com/cpuguy83/go-md2man/v2 v2.0.4 h1:wfIWP927BUkWJb2NmU/kNDYIBTh/ziUX91+lVfRxZq4=
+github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
@@ -6,8 +8,9 @@ github.com/henvic/httpretty v0.1.3 h1:4A6vigjz6Q/+yAfTD4wqipCv+Px69C7Th/NhT0ApuU
 github.com/henvic/httpretty v0.1.3/go.mod h1:UUEv7c2kHZ5SPQ51uS3wBpzPDibg2U3Y+IaXyHy5GBg=
 github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
 github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
-github.com/mattn/go-runewidth v0.0.9 h1:Lm995f3rfxdpd6TSmuVCHVb/QhupuXlYr8sCI/QdE+0=
 github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI=
+github.com/mattn/go-runewidth v0.0.15 h1:UNAjwbU9l54TA3KzvqLGxwWjHmMgBUVhBiTjelZgg3U=
+github.com/mattn/go-runewidth v0.0.15/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
 github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
 github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
 github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
@@ -17,16 +20,25 @@ github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N
 github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
+github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
+github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
 github.com/rs/dnscache v0.0.0-20230804202142-fc85eb664529 h1:18kd+8ZUlt/ARXhljq+14TwAoKa61q6dX8jtwOf6DH8=
 github.com/rs/dnscache v0.0.0-20230804202142-fc85eb664529/go.mod h1:qe5TWALJ8/a1Lqznoc5BDHpYX/8HU60Hm2AwRmqzxqA=
+github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk=
+github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
 github.com/smallnest/ringbuffer v0.0.0-20240423223918-bab516b2000b h1:e9eeuSYSLmUKxy7ALzKcxo7ggTceQaVcBhjDIcewa9c=
 github.com/smallnest/ringbuffer v0.0.0-20240423223918-bab516b2000b/go.mod h1:tAG61zBM1DYRaGIPloumExGvScf08oHuo0kFoOqdbT0=
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
 github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
 github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
 github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
-golang.org/x/exp v0.0.0-20240707233637-46b078467d37 h1:uLDX+AfeFCct3a2C7uIWBKMJIR3CJMhcgfrUAqjRK6w=
-golang.org/x/exp v0.0.0-20240707233637-46b078467d37/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY=
+github.com/urfave/cli/v2 v2.27.2 h1:6e0H+AkS+zDckwPCUrZkKX38mRaau4nL2uipkJpbkcI=
+github.com/urfave/cli/v2 v2.27.2/go.mod h1:g0+79LmHHATl7DAcHO99smiR/T7uGLw84w8Y42x+4eM=
+github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 h1:gEOO8jv9F4OT7lGCjxCBTO/36wtF6j2nSip77qHd4x4=
+github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM=
+golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8=
+golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY=
 golang.org/x/mod v0.19.0 h1:fEdghXQSo20giMthA7cd28ZC+jts4amQ3YMXiP5oMQ8=
 golang.org/x/mod v0.19.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
 golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
diff --git a/cmd/gguf-parser/main.go b/cmd/gguf-parser/main.go
index 36346e1..e8b1ce2 100644
--- a/cmd/gguf-parser/main.go
+++ b/cmd/gguf-parser/main.go
@@ -1,7 +1,6 @@
 package main
 
 import (
-	"flag"
 	"os"
 	"fmt"
 	"context"
@@ -9,11 +8,15 @@ import (
 	"strings"
 	"sync"
 	"regexp"
+	"errors"
+	"path/filepath"
 
+	"github.com/urfave/cli/v2"
 	"github.com/olekukonko/tablewriter"
 
 	"github.com/thxcode/gguf-parser-go/util/anyx"
 	"github.com/thxcode/gguf-parser-go/util/json"
+	"github.com/thxcode/gguf-parser-go/util/signalx"
 	. "github.com/thxcode/gguf-parser-go"
 )
@@ -21,188 +24,506 @@ import (
 var Version = "v0.0.0"
 
 func main() {
-	ctx := context.Background()
-
-	// Parse arguments.
+	name := filepath.Base(os.Args[0])
+	app := &cli.App{
+		Name:                   name,
+		Usage:                  "Review/Check/Estimate the GGUF file.",
+		UsageText:              name + " [global options]",
+		Version:                Version,
+		UseShortOptionHandling: true,
+		HideVersion:            true,
+		HideHelp:               true,
+		Reader:                 os.Stdin,
+		Writer:                 os.Stdout,
+		ErrWriter:              os.Stderr,
+		OnUsageError: func(c *cli.Context, _ error, _ bool) error {
+			return cli.ShowAppHelp(c)
+		},
+		Flags: []cli.Flag{
+			&cli.BoolFlag{
+				Name:               "help",
+				Aliases:            []string{"h"},
+				Usage:              "Print the usage.",
+				DisableDefaultText: true,
+			},
+			&cli.BoolFlag{
+				Name:               "version",
+				Aliases:            []string{"v"},
+				Usage:              "Print the version.",
+				DisableDefaultText: true,
+			},
+			&cli.BoolFlag{
+				Destination: &debug,
+				Value:       debug,
+				Name:        "debug",
+				Usage:       "Enable debugging, verbosity.",
+			},
+			&cli.StringFlag{
+				Destination: &path,
+				Value:       path,
+				Category:    "Model/Local",
+				Name:        "path",
+				Aliases:     []string{"model", "m"},
+				Usage: "Path of the GGUF file to load for the main model, e.g. ~/.cache" +
+					"/lm-studio/models/QuantFactory/Qwen2-7B-Instruct-GGUF" +
+					"/Qwen2-7B-Instruct.Q5_K_M.gguf.",
+			},
+			&cli.StringFlag{
+				Destination: &draftPath,
+				Value:       draftPath,
+				Category:    "Model/Local",
+				Name:        "draft-path",
+				Aliases:     []string{"model-draft", "md"},
+				Usage: "Path of the GGUF file to load for the draft model, optional, e.g. ~/.cache" +
+					"/lm-studio/models/QuantFactory/Qwen2-1.5B-Instruct-GGUF" +
+					"/Qwen2-1.5B-Instruct.Q5_K_M.gguf.",
+			},
+			&cli.StringFlag{
+				Destination: &mmprojPath,
+				Value:       mmprojPath,
+				Category:    "Model/Local",
+				Name:        "mmproj-path",
+				Aliases:     []string{"mmproj"},
+				Usage:       "Path of the GGUF file to load for the multimodal projector, optional.",
+			},
+			&cli.StringFlag{
+				Destination: &url,
+				Value:       url,
+				Category:    "Model/Remote",
+				Name:        "url",
+				Aliases:     []string{"model-url", "mu"},
+				Usage: "URL of the GGUF file to load for the main model, e.g. " +
+					"https://huggingface.co/QuantFactory/Qwen2-7B-Instruct-GGUF" +
+					"/resolve/main/Qwen2-7B-Instruct.Q5_K_M.gguf. " +
+					"Note that gguf-parser does not need to download the entire GGUF file.",
+			},
+			&cli.StringFlag{
+				Destination: &draftUrl,
+				Value:       draftUrl,
+				Category:    "Model/Remote",
+				Name:        "draft-url",
+				Usage: "URL of the GGUF file to load for the draft model, optional, e.g. " +
+					"https://huggingface.co/QuantFactory/Qwen2-1.5B-Instruct-GGUF" +
+					"/resolve/main/Qwen2-1.5B-Instruct.Q5_K_M.gguf. " +
+					"Note that gguf-parser does not need to download the entire GGUF file.",
+			},
+			&cli.StringFlag{
+				Destination: &mmprojUrl,
+				Value:       mmprojUrl,
+				Category:    "Model/Remote",
+				Name:        "mmproj-url",
+				Usage:       "URL of the GGUF file to load for the multimodal projector, optional.",
+			},
+			&cli.StringFlag{
+				Destination: &token,
+				Value:       token,
+				Category:    "Model/Remote",
+				Name:        "token",
+				Usage: "Bearer auth token to load GGUF file, optional, " +
+					"works with --url/--draft-url.",
+			},
+			&cli.StringFlag{
+				Destination: &hfRepo,
+				Value:       hfRepo,
+				Category:    "Model/Remote/HuggingFace",
+				Name:        "hf-repo",
+				Aliases:     []string{"hfr"},
+				Usage: "Repository of HuggingFace which stores the GGUF file for the main model, e.g. " +
+					"QuantFactory/Qwen2-7B-Instruct-GGUF, works with --hf-file.",
+			},
+			&cli.StringFlag{
+				Destination: &hfFile,
+				Value:       hfFile,
+				Category:    "Model/Remote/HuggingFace",
+				Name:        "hf-file",
+				Aliases:     []string{"hff"},
+				Usage: "Model file below the --hf-repo, e.g. " +
" + + "Qwen2-7B-Instruct.Q5_K_M.gguf.", + }, + &cli.StringFlag{ + Destination: &hfMMProjFile, + Value: hfMMProjFile, + Category: "Model/Remote/HuggingFace", + Name: "hf-mmproj-file", + Usage: "Multimodal projector file below the --hf-repo.", + }, + &cli.StringFlag{ + Destination: &hfDraftRepo, + Value: hfDraftRepo, + Category: "Model/Remote/HuggingFace", + Name: "hf-draft-repo", + Usage: "Repository of HuggingFace which the GGUF file store for the draft model, optional, e.g. " + + "QuantFactory/Qwen2-1.5B-Instruct-GGUF, works with --hf-draft-file.", + }, + &cli.StringFlag{ + Destination: &hfDraftFile, + Value: hfDraftFile, + Category: "Model/Remote/HuggingFace", + Name: "hf-draft-file", + Usage: "Model file below the --hf-draft-repo, optional, e.g. " + + "Qwen2-1.5B-Instruct.Q5_K_M.gguf.", + }, + &cli.StringFlag{ + Destination: &hfToken, + Value: hfToken, + Category: "Model/Remote/HuggingFace", + Name: "hf-token", + Aliases: []string{"hft"}, + Usage: "User access token of HuggingFace, optional, " + + "works with --hf-repo/--hf-file pair or --hf-draft-repo/--hf-draft-file pair. " + + "See https://huggingface.co/settings/tokens.", + }, + &cli.StringFlag{ + Destination: &msRepo, + Value: msRepo, + Category: "Model/Remote/ModelScope", + Name: "ms-repo", + Usage: "Repository of ModelScope which the GGUF file store for the main model, e.g. " + + "qwen/Qwen1.5-7B-Chat-GGUF, works with --ms-file.", + }, + &cli.StringFlag{ + Destination: &msFile, + Value: msFile, + Category: "Model/Remote/ModelScope", + Name: "ms-file", + Usage: "Model file below the --ms-repo, e.g. " + + "qwen1_5-7b-chat-q5_k_m.gguf.", + }, + &cli.StringFlag{ + Destination: &msMMProjFile, + Value: msMMProjFile, + Category: "Model/Remote/ModelScope", + Name: "ms-mmproj-file", + Usage: "Multimodal projector file below the --ms-repo.", + }, + &cli.StringFlag{ + Destination: &msDraftRepo, + Value: msDraftRepo, + Category: "Model/Remote/ModelScope", + Name: "ms-draft-repo", + Usage: "Repository of ModelScope which the GGUF file store for the draft model, optional, e.g. " + + "qwen/Qwen1.5-1.8B-Chat-GGUF, works with --ms-draft-file.", + }, + &cli.StringFlag{ + Destination: &msDraftFile, + Value: msDraftFile, + Category: "Model/Remote/ModelScope", + Name: "ms-draft-file", + Usage: "Model file below the --ms-draft-repo, optional, e.g. " + + "qwen1_5-1_8b-chat-q5_k_m.gguf.", + }, + &cli.StringFlag{ + Destination: &msToken, + Value: msToken, + Category: "Model/Remote/ModelScope", + Name: "ms-token", + Usage: "Git access token of ModelScope, optional, " + + "works with --ms-repo/--ms-file pair or --ms-draft-repo/--ms-draft-file pair. " + + "See https://modelscope.cn/my/myaccesstoken.", + }, + &cli.StringFlag{ + Destination: &olModel, + Value: olModel, + Category: "Model/Remote/Ollama", + Name: "ol-model", + Usage: "Model name of Ollama, e.g. 
" + + "gemma2.", + }, + &cli.BoolFlag{ + Destination: &olUsage, + Value: olUsage, + Category: "Model/Remote/Ollama", + Name: "ol-usage", + Usage: "Specify respecting the extending layers introduced by Ollama, " + + "works with --ol-model, which affects the usage estimation.", + }, + &cli.BoolFlag{ + Destination: &skipProxy, + Value: skipProxy, + Category: "Load", + Name: "skip-proxy", + Usage: "Skip proxy settings, " + + "works with --url/--hf-*/--ms-*/--ol-*, " + + "default is respecting the environment variables HTTP_PROXY/HTTPS_PROXY/NO_PROXY.", + }, + &cli.BoolFlag{ + Destination: &skipTLSVerify, + Value: skipTLSVerify, + Category: "Load", + Name: "skip-tls-verify", + Usage: "Skip TLS verification, " + + "works with --url/--hf-*/--ms-*/--ol-*, " + + "default is verifying the TLS certificate on HTTPs request.", + }, + &cli.BoolFlag{ + Destination: &skipDNSCache, + Value: skipDNSCache, + Category: "Load", + Name: "skip-dns-cache", + Usage: "Skip DNS cache, " + + "works with --url/--hf-*/--ms-*/--ol-*, " + + "default is caching the DNS lookup result.", + }, + &cli.BoolFlag{ + Destination: &skipRangDownloadDetect, + Value: skipRangDownloadDetect, + Category: "Load", + Name: "skip-rang-download-detect", + Usage: "Skip range download detect, " + + "works with --url/--hf-*/--ms-*/--ol-*, " + + "default is detecting the range download support.", + }, + &cli.BoolFlag{ + Destination: &skipCache, + Value: skipCache, + Category: "Load", + Name: "skip-cache", + Usage: "Skip cache, " + + "works with --url/--hf-*/--ms-*/--ol-*, " + + "default is caching the read result.", + }, + &cli.IntFlag{ + Destination: &ctxSize, + Value: ctxSize, + Category: "Estimate", + Name: "ctx-size", + Aliases: []string{"c"}, + Usage: "Specify the size of prompt context, " + + "which is used to estimate the usage, " + + "default is equal to the model's maximum context size.", + }, + &cli.BoolFlag{ + Destination: &inMaxCtxSize, + Value: inMaxCtxSize, + Category: "Estimate", + Name: "in-max-ctx-size", + Usage: "Limit the context size to the maximum context size of the model, " + + "if the context size is larger than the maximum context size.", + }, + &cli.IntFlag{ + Destination: &physicalBatchSize, + Value: physicalBatchSize, + Category: "Estimate", + Name: "ubatch-size", + Aliases: []string{"ub"}, + Usage: "Specify the physical maximum batch size, " + + "which is used to estimate the usage.", + }, + &cli.IntFlag{ + Destination: ¶llelSize, + Value: parallelSize, + Category: "Estimate", + Name: "parallel-size", + Aliases: []string{"parallel", "np"}, + Usage: "Specify the number of parallel sequences to decode, " + + "which is used to estimate the usage.", + }, + &cli.StringFlag{ + Destination: &kvType, + Value: kvType, + Category: "Estimate", + Name: "kv-type", + Usage: "Specify the type of Key-Value cache, " + + "which is used to estimate the usage, select from [f32, f16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1]", + }, + &cli.BoolFlag{ + Destination: &noKVOffload, + Value: noKVOffload, + Category: "Estimate", + Name: "no-kv-offload", + Aliases: []string{"nkvo"}, + Usage: "Specify disabling Key-Value offloading, " + + "which is used to estimate the usage. " + + "Disable Key-Value offloading can reduce the usage of VRAM.", + }, + &cli.BoolFlag{ + Destination: &flashAttention, + Value: flashAttention, + Category: "Estimate", + Name: "flash-attention", + Aliases: []string{"fa"}, + Usage: "Specify enabling Flash Attention, " + + "which is used to estimate the usage. 
" + + "Flash Attention can reduce the usage of RAM/VRAM.", + }, + &cli.StringFlag{ + Destination: &platformFootprint, + Value: platformFootprint, + Category: "Estimate", + Name: "platform-footprint", + Usage: "Specify the platform footprint(RAM,VRAM) in MiB, " + + "which is used to estimate the NonUMA usage, " + + "default is 150,250. " + + "Different platform always gets different RAM and VRAM footprints, " + + "for example, within CUDA, 'cudaMemGetInfo' would occupy some RAM and VRAM, " + + "see https://stackoverflow.com/questions/64854862/free-memory-occupied-by-cudamemgetinfo.", + }, + &cli.BoolFlag{ + Destination: &noMMap, + Value: noMMap, + Category: "Estimate", + Name: "no-mmap", + Usage: "Specify disabling Memory-Mapped using, " + + "which is used to estimate the usage. " + + "Memory-Mapped can avoid loading the entire model weights into RAM.", + }, + &cli.IntFlag{ + Destination: &offloadLayers, + Value: offloadLayers, + Category: "Estimate", + Name: "gpu-layers", + Aliases: []string{"ngl"}, + Usage: "Specify how many layers of the main model to offload, " + + "which is used to estimate the usage, " + + "default is full offloaded.", + }, + &cli.IntFlag{ + Destination: &offloadLayersDraft, + Value: offloadLayersDraft, + Category: "Estimate", + Name: "gpu-layers-draft", + Aliases: []string{"ngld"}, + Usage: "Specify how many layers of the draft model to offload, " + + "which is used to estimate the usage, " + + "default is full offloaded.", + }, + &cli.Uint64Flag{ + Destination: &offloadLayersStep, + Value: offloadLayersStep, + Category: "Estimate", + Name: "gpu-layers-step", + Usage: "Specify the step of layers to offload, " + + "works with --gpu-layers.", + }, + &cli.BoolFlag{ + Destination: &raw, + Value: raw, + Category: "Output", + Name: "raw", + Usage: "Output the file in JSON only, skip anything.", + }, + &cli.BoolFlag{ + Destination: &skipModel, + Value: skipModel, + Category: "Output", + Name: "skip-model", + Usage: "Skip to display model metadata.", + }, + &cli.BoolFlag{ + Destination: &skipArchitecture, + Value: skipArchitecture, + Category: "Output", + Name: "skip-architecture", + Usage: "Skip to display architecture metadata.", + }, + &cli.BoolFlag{ + Destination: &skipTokenizer, + Value: skipTokenizer, + Category: "Output", + Name: "skip-tokenizer", + Usage: "Skip to display tokenizer metadata.", + }, + &cli.BoolFlag{ + Destination: &skipEstimate, + Value: skipEstimate, + Category: "Output", + Name: "skip-estimate", + Usage: "Skip to estimate.", + }, + &cli.BoolFlag{ + Destination: &inMib, + Value: inMib, + Category: "Output", + Name: "in-mib", + Usage: "Display the estimated result in table with MiB.", + }, + &cli.BoolFlag{ + Destination: &inJson, + Value: inJson, + Category: "Output", + Name: "json", + }, + &cli.BoolFlag{ + Destination: &inPrettyJson, + Value: inPrettyJson, + Category: "Output", + Name: "json-pretty", + Usage: "Output as pretty JSON.", + }, + }, + Action: func(c *cli.Context) error { + if c.Bool("help") { + return cli.ShowAppHelp(c) + } + if c.Bool("version") { + cli.ShowVersion(c) + return nil + } + return run(c.Context) + }, + } - var ( - // model options - path string - mmprojPath string // for estimate - draftPath string // for estimate - url string - mmprojUrl string // for estimate - draftUrl string // for estimate - token string - hfRepo string - hfFile string - hfMMProjFile string // for estimate - hfDraftRepo string // for estimate - hfDraftFile string // for estimate - hfToken string - msRepo string - msFile string - msMMProjFile string // 
-		msDraftRepo  string // for estimate
-		msDraftFile  string // for estimate
-		msToken      string
-		olModel      string
-		olUsage      bool
-		// read options
-		debug                  bool
-		skipProxy              bool
-		skipTLSVerify          bool
-		skipDNSCache           bool
-		skipRangDownloadDetect bool
-		skipCache              bool
-		// estimate options
-		ctxSize            = -1
-		inMaxCtxSize       bool
-		physicalBatchSize  = 512
-		parallelSize       = 1
-		kvType             = "f16"
-		noKVOffload        bool
-		flashAttention     bool
-		platformFootprint  = "150,250"
-		noMMap             bool
-		offloadLayers      = -1
-		offloadLayersDraft = -1
-		offloadLayersStep  uint64
-		// output options
-		version          bool
-		raw              bool
-		skipModel        bool
-		skipArchitecture bool
-		skipTokenizer    bool
-		skipEstimate     bool
-		inMib            bool
-		inJson           bool
-		inPrettyJson     = true
-	)
-	fs := flag.NewFlagSet(os.Args[0], flag.ExitOnError)
-	fs.Usage = func() {
-		_, _ = fmt.Fprintf(fs.Output(), "Usage of gguf-parser %v:\n", Version)
-		fs.PrintDefaults()
-	}
-	fs.StringVar(&path, "path", path, "Path where the GGUF file to load for the main model, e.g. ~/.cache"+
-		"/lm-studio/models/QuantFactory/Qwen2-7B-Instruct-GGUF"+
-		"/Qwen2-7B-Instruct.Q5_K_M.gguf.")
-	fs.StringVar(&draftPath, "draft-path", draftPath, "Path where the GGUF file to load for the draft model, optional, e.g. ~/.cache"+
-		"/lm-studio/models/QuantFactory/Qwen2-1.5B-Instruct-GGUF"+
-		"/Qwen2-1.5B-Instruct.Q5_K_M.gguf")
-	fs.StringVar(&mmprojPath, "mmproj-path", mmprojPath, "Path where the GGUF file to load for the multimodal projector, optional.")
-	fs.StringVar(&url, "url", url, "Url where the GGUF file to load for the main model, e.g. "+
-		"https://huggingface.co/QuantFactory/Qwen2-7B-Instruct-GGUF"+
-		"/resolve/main/Qwen2-7B-Instruct.Q5_K_M.gguf. "+
-		"Note that gguf-parser does not need to download the entire GGUF file.")
-	fs.StringVar(&draftUrl, "draft-url", draftUrl, "Url where the GGUF file to load for the draft model, optional, e.g. "+
-		"https://huggingface.co/QuantFactory/Qwen2-1.5B-Instruct-GGUF"+
-		"/resolve/main/Qwen2-1.5B-Instruct.Q5_K_M.gguf. "+
-		"Note that gguf-parser does not need to download the entire GGUF file.")
-	fs.StringVar(&mmprojUrl, "mmproj-url", mmprojUrl, "Url where the GGUF file to load for the multimodal projector, optional.")
-	fs.StringVar(&token, "token", token, "Bearer auth token to load GGUF file, optional, "+
-		"works with --url/--draft-url.")
-	fs.StringVar(&hfRepo, "hf-repo", hfRepo, "Repository of HuggingFace which the GGUF file store for the main model, e.g. "+
-		"QuantFactory/Qwen2-7B-Instruct-GGUF, works with --hf-file.")
-	fs.StringVar(&hfFile, "hf-file", hfFile, "Model file below the --hf-repo, e.g. "+
-		"Qwen2-7B-Instruct.Q5_K_M.gguf.")
-	fs.StringVar(&hfMMProjFile, "hf-mmproj-file", hfMMProjFile, "Multimodal projector file below the --hf-repo.")
-	fs.StringVar(&hfDraftRepo, "hf-draft-repo", hfDraftRepo, "Repository of HuggingFace which the GGUF file store for the draft model, optional, e.g. "+
-		"QuantFactory/Qwen2-1.5B-Instruct-GGUF, works with --hf-draft-file.")
-	fs.StringVar(&hfDraftFile, "hf-draft-file", hfDraftFile, "Model file below the --hf-draft-repo, optional, e.g. "+
-		"Qwen2-1.5B-Instruct.Q5_K_M.gguf.")
-	fs.StringVar(&hfToken, "hf-token", hfToken, "User access token of HuggingFace, optional, "+
-		"works with --hf-repo/--hf-file pair or --hf-draft-repo/--hf-draft-file pair. "+
-		"See https://huggingface.co/settings/tokens.")
-	fs.StringVar(&msRepo, "ms-repo", msRepo, "Repository of ModelScope which the GGUF file store for the main model, e.g. "+
"+ - "qwen/Qwen1.5-7B-Chat-GGUF, works with --ms-file.") - fs.StringVar(&msFile, "ms-file", msFile, "Model file below the --ms-repo, e.g. "+ - "qwen1_5-7b-chat-q5_k_m.gguf.") - fs.StringVar(&msMMProjFile, "ms-mmproj-file", msMMProjFile, "Multimodal projector file below the --ms-repo.") - fs.StringVar(&msDraftRepo, "ms-draft-repo", msDraftRepo, "Repository of ModelScope which the GGUF file store for the draft model, optional, e.g. "+ - "qwen/Qwen1.5-1.8B-Chat-GGUF, works with --ms-draft-file.") - fs.StringVar(&msDraftFile, "ms-draft-file", msDraftFile, "Model file below the --ms-draft-repo, optional, e.g. "+ - "qwen1_5-1_8b-chat-q5_k_m.gguf.") - fs.StringVar(&msToken, "ms-token", msToken, "Git access token of ModelScope, optional, "+ - "works with --ms-repo/--ms-file pair or --ms-draft-repo/--ms-draft-file pair. "+ - "See https://modelscope.cn/my/myaccesstoken.") - fs.StringVar(&olModel, "ol-model", olModel, "Model name of Ollama, e.g. "+ - "gemma2.") - fs.BoolVar(&olUsage, "ol-usage", olUsage, "Specify respecting the extending layers introduced by Ollama, "+ - "works with --ol-model, which affects the usage estimation.") - fs.BoolVar(&debug, "debug", debug, "Enable debugging, verbosity.") - fs.BoolVar(&skipProxy, "skip-proxy", skipProxy, "Skip proxy settings, "+ - "works with --url/--hf-*/--ms-*/--ol-*, "+ - "default is respecting the environment variables HTTP_PROXY/HTTPS_PROXY/NO_PROXY.") - fs.BoolVar(&skipTLSVerify, "skip-tls-verify", skipTLSVerify, "Skip TLS verification, "+ - "works with --url/--hf-*/--ms-*/--ol-*, "+ - "default is verifying the TLS certificate on HTTPs request.") - fs.BoolVar(&skipDNSCache, "skip-dns-cache", skipDNSCache, "Skip DNS cache, "+ - "works with --url/--hf-*/--ms-*/--ol-*, "+ - "default is caching the DNS lookup result.") - fs.BoolVar(&skipRangDownloadDetect, "skip-rang-download-detect", skipRangDownloadDetect, "Skip range download detect, "+ - "works with --url/--hf-*/--ms-*/--ol-*, "+ - "default is detecting the range download support.") - fs.BoolVar(&skipCache, "skip-cache", skipCache, "Skip cache, "+ - "works with --url/--hf-*/--ms-*/--ol-*, "+ - "default is caching the read result.") - fs.IntVar(&ctxSize, "ctx-size", ctxSize, "Specify the size of prompt context, "+ - "which is used to estimate the usage, "+ - "default is equal to the model's maximum context size.") - fs.BoolVar(&inMaxCtxSize, "in-max-ctx-size", inMaxCtxSize, "Limit the context size to the maximum context size of the model, "+ - "if the context size is larger than the maximum context size.") - fs.IntVar(&physicalBatchSize, "ubatch-size", physicalBatchSize, "Specify the physical maximum batch size, "+ - "which is used to estimate the usage, "+ - "default is 512.") - fs.IntVar(¶llelSize, "parallel-size", parallelSize, "Specify the number of parallel sequences to decode, "+ - "which is used to estimate the usage, "+ - "default is 1.") - fs.StringVar(&kvType, "kv-type", kvType, "Specify the type of Key-Value cache, "+ - "which is used to estimate the usage, select from [f32, f16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1], "+ - "default is f16. "+ - "Use quantization type means enabling --flash-attention as well.") - fs.BoolVar(&noKVOffload, "no-kv-offload", noKVOffload, "Specify disabling Key-Value offloading, "+ - "which is used to estimate the usage. "+ - "Key-Value offloading can reduce the usage of VRAM.") - fs.BoolVar(&flashAttention, "flash-attention", flashAttention, "Specify enabling Flash Attention, "+ - "which is used to estimate the usage. 
"+ - "Flash Attention can reduce the usage of RAM/VRAM.") - fs.StringVar(&platformFootprint, "platform-footprint", platformFootprint, "Specify the platform footprint(RAM,VRAM) in MiB, "+ - "which is used to estimate the NonUMA usage, "+ - "default is 150,250. "+ - "Different platform always gets different RAM and VRAM footprints, "+ - "for example, within CUDA, `cudaMemGetInfo` would occupy some RAM and VRAM, "+ - "see https://stackoverflow.com/questions/64854862/free-memory-occupied-by-cudamemgetinfo.") - fs.BoolVar(&noMMap, "no-mmap", noMMap, "Specify disabling Memory-Mapped using, "+ - "which is used to estimate the usage. "+ - "Memory-Mapped can avoid loading the entire model weights into RAM.") - fs.IntVar(&offloadLayers, "gpu-layers", offloadLayers, "Specify how many layers of the main model to offload, "+ - "which is used to estimate the usage, "+ - "default is full offloaded.") - fs.IntVar(&offloadLayersDraft, "gpu-layers-draft", offloadLayersDraft, "Specify how many layers of the draft model to offload, "+ - "which is used to estimate the usage, "+ - "default is full offloaded.") - fs.Uint64Var(&offloadLayersStep, "gpu-layers-step", offloadLayersStep, "Specify the step of layers to offload, "+ - "works with --gpu-layers.") - fs.BoolVar(&version, "version", version, "Show gguf-parser version.") - fs.BoolVar(&raw, "raw", raw, "Output the file only, skip anything.") - fs.BoolVar(&skipModel, "skip-model", skipModel, "Skip to display model metadata.") - fs.BoolVar(&skipArchitecture, "skip-architecture", skipArchitecture, "Skip to display architecture metadata.") - fs.BoolVar(&skipTokenizer, "skip-tokenizer", skipTokenizer, "Skip to display tokenizer metadata") - fs.BoolVar(&skipEstimate, "skip-estimate", skipEstimate, "Skip to estimate.") - fs.BoolVar(&inMib, "in-mib", inMib, "Display the estimated result in table with MiB.") - fs.BoolVar(&inJson, "json", inJson, "Output as JSON.") - fs.BoolVar(&inPrettyJson, "json-pretty", inPrettyJson, "Output as pretty JSON.") - if err := fs.Parse(os.Args[1:]); err != nil { - fmt.Println(err.Error()) + if err := app.RunContext(signalx.Handler(), os.Args); err != nil { + _, _ = fmt.Fprintf(os.Stderr, "%v\n", err) os.Exit(1) } +} - if version { - fmt.Printf("gguf-parser %s\n", Version) - return - } +var ( + // model options + path string + mmprojPath string // for estimate + draftPath string // for estimate + url string + mmprojUrl string // for estimate + draftUrl string // for estimate + token string + hfRepo string + hfFile string + hfMMProjFile string // for estimate + hfDraftRepo string // for estimate + hfDraftFile string // for estimate + hfToken string + msRepo string + msFile string + msMMProjFile string // for estimate + msDraftRepo string // for estimate + msDraftFile string // for estimate + msToken string + olModel string + olUsage bool + // load options + debug bool + skipProxy bool + skipTLSVerify bool + skipDNSCache bool + skipRangDownloadDetect bool + skipCache bool + // estimate options + ctxSize = -1 + inMaxCtxSize bool + physicalBatchSize = 512 + parallelSize = 1 + kvType = "f16" + noKVOffload bool + flashAttention bool + platformFootprint = "150,250" + noMMap bool + offloadLayers = -1 + offloadLayersDraft = -1 + offloadLayersStep uint64 + // output options + raw bool + skipModel bool + skipArchitecture bool + skipTokenizer bool + skipEstimate bool + inMib bool + inJson bool + inPrettyJson = true +) +func run(ctx context.Context) error { // Prepare options. ropts := []GGUFReadOption{ @@ -285,8 +606,7 @@ func main() { // Main model. 
 	switch {
 	default:
-		_, _ = fmt.Fprintf(os.Stderr, "no model specified\n")
-		os.Exit(1)
+		return errors.New("no model specified")
 	case path != "":
 		gf, err = ParseGGUFFile(path, ropts...)
 	case url != "":
@@ -330,8 +650,7 @@ func main() {
 		}
 	}
 	if err != nil {
-		_, _ = fmt.Fprintf(os.Stderr, "failed to parse GGUF file: %s\n", err.Error())
-		os.Exit(1)
+		return fmt.Errorf("failed to parse GGUF file: %w", err)
 	}
 
 	// MultimodalProjector model.
@@ -346,8 +665,7 @@ func main() {
 			mmpgf, err = ParseGGUFFileFromModelScope(ctx, msRepo, msMMProjFile, ropts...)
 		}
 		if err != nil {
-			_, _ = fmt.Fprintf(os.Stderr, "failed to parse multimodal projector GGUF file: %s\n", err.Error())
-			os.Exit(1)
+			return fmt.Errorf("failed to parse multimodal projector GGUF file: %w", err)
 		}
 	}
 
 	// Drafter model.
@@ -362,8 +680,7 @@ func main() {
 			dftgf, err = ParseGGUFFileFromModelScope(ctx, msDraftRepo, msDraftFile, ropts...)
 		}
 		if err != nil {
-			_, _ = fmt.Fprintf(os.Stderr, "failed to parse draft GGUF file: %s\n", err.Error())
-			os.Exit(1)
+			return fmt.Errorf("failed to parse draft GGUF file: %w", err)
 		}
 	}
@@ -375,10 +692,9 @@ func main() {
 			enc.SetIndent("", "  ")
 		}
 		if err := enc.Encode(gf); err != nil {
-			_, _ = fmt.Fprintf(os.Stderr, "failed to encode JSON: %s\n", err.Error())
-			os.Exit(1)
+			return fmt.Errorf("failed to encode JSON: %w", err)
 		}
-		return
+		return nil
 	}
 
 	// Otherwise, display the metadata and estimate the usage.
@@ -490,11 +806,10 @@ func main() {
 			enc.SetIndent("", "  ")
 		}
 		if err := enc.Encode(o); err != nil {
-			_, _ = fmt.Fprintf(os.Stderr, "failed to encode JSON: %s\n", err.Error())
-			os.Exit(1)
+			return fmt.Errorf("failed to encode JSON: %w", err)
 		}
-		return
+		return nil
 	}
 
 	InMiBytes = inMib
@@ -636,6 +951,8 @@ func main() {
 			mg, bds...)
 	}
+
+	return nil
 }
 
 func sprintf(f any, a ...any) string {
diff --git a/util/signalx/handler.go b/util/signalx/handler.go
new file mode 100644
index 0000000..d4efb6c
--- /dev/null
+++ b/util/signalx/handler.go
@@ -0,0 +1,34 @@
+package signalx
+
+import (
+	"context"
+	"os"
+	"os/signal"
+)
+
+var registered = make(chan struct{})
+
+// Handler registers for signals and returns a context.
+func Handler() context.Context {
+	close(registered) // Panics when called twice.
+
+	sigChan := make(chan os.Signal, len(sigs))
+	ctx, cancel := context.WithCancel(context.Background())
+
+	// Register for signals.
+	signal.Notify(sigChan, sigs...)
+
+	// Process signals.
+	go func() {
+		var exited bool
+		for range sigChan {
+			if exited {
+				os.Exit(1)
+			}
+			cancel()
+			exited = true
+		}
+	}()
+
+	return ctx
+}
diff --git a/util/signalx/handler_posix.go b/util/signalx/handler_posix.go
new file mode 100644
index 0000000..556658d
--- /dev/null
+++ b/util/signalx/handler_posix.go
@@ -0,0 +1,10 @@
+//go:build !windows
+
+package signalx
+
+import (
+	"os"
+	"syscall"
+)
+
+var sigs = []os.Signal{syscall.SIGINT, syscall.SIGTERM}
diff --git a/util/signalx/handler_windows.go b/util/signalx/handler_windows.go
new file mode 100644
index 0000000..46483b9
--- /dev/null
+++ b/util/signalx/handler_windows.go
@@ -0,0 +1,8 @@
+package signalx
+
+import (
+	"os"
+	"syscall"
+)
+
+var sigs = []os.Signal{syscall.SIGINT, syscall.SIGTERM}
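To illustrate how the new signal handling composes with the CLI, here is a minimal sketch of a hypothetical consumer (only `signalx.Handler`, `cli.App.RunContext`, and `c.Context` come from the patch above; the app name and the timed work are illustrative):

```go
package main

import (
	"fmt"
	"os"
	"time"

	"github.com/thxcode/gguf-parser-go/util/signalx"
	"github.com/urfave/cli/v2"
)

func main() {
	app := &cli.App{
		Name: "signal-demo", // illustrative name, not part of the patch
		Action: func(c *cli.Context) error {
			// c.Context is the context produced by signalx.Handler below:
			// the first SIGINT/SIGTERM cancels it, and a second signal makes
			// the handler goroutine call os.Exit(1) immediately.
			select {
			case <-c.Context.Done():
				return c.Context.Err()
			case <-time.After(5 * time.Second):
				return nil // work finished before any signal arrived
			}
		},
	}
	// Mirrors main.go above: the signal-aware context drives the whole app.
	if err := app.RunContext(signalx.Handler(), os.Args); err != nil {
		_, _ = fmt.Fprintf(os.Stderr, "%v\n", err)
		os.Exit(1)
	}
}
```

Note that because `Handler` closes the package-level `registered` channel on entry, it must be called exactly once per process; a second call panics by design.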