From ab75c5a366e2dbea9852dbae2bc6e715ab9a0f20 Mon Sep 17 00:00:00 2001 From: thxCode Date: Fri, 5 Jul 2024 13:56:28 +0800 Subject: [PATCH] refactor: support ollama modelfile usage estimate Signed-off-by: thxCode --- Makefile | 4 +- README.md | 7 +- cmd/gguf-parser/README.md | 140 ++++---- cmd/gguf-parser/main.go | 288 ++++++++++------ file.go | 31 +- file_architecture.go | 133 +++++++- file_estimate.go | 64 +++- file_estimate_option.go | 9 + file_from_metadata.go => file_from_distro.go | 96 +++--- file_from_remote.go | 2 +- file_model.go | 2 + go.mod | 2 +- ollama_model.go | 329 +++++++++++++++++-- util/anyx/any.go | 128 ++++++++ util/httpx/client.go | 3 + util/json/common.go | 2 + util/json/jsoniter.go | 3 +- util/json/stdjson.go | 2 - util/stringx/bytes.go | 14 + 19 files changed, 977 insertions(+), 282 deletions(-) rename file_from_metadata.go => file_from_distro.go (76%) create mode 100644 util/anyx/any.go create mode 100644 util/stringx/bytes.go diff --git a/Makefile b/Makefile index f70db18..2f9b858 100644 --- a/Makefile +++ b/Makefile @@ -70,8 +70,10 @@ gguf-parser: [[ -d "$(SRCDIR)/.dist" ]] || mkdir -p "$(SRCDIR)/.dist" cd "$(SRCDIR)/cmd/gguf-parser" && for os in darwin linux windows; do \ + tags="netgo"; \ if [[ $$os == "windows" ]]; then \ suffix=".exe"; \ + tags="netcgo"; \ else \ suffix=""; \ fi; \ @@ -80,7 +82,7 @@ gguf-parser: GOOS="$$os" GOARCH="$$arch" CGO_ENABLED=1 go build \ -trimpath \ -ldflags="-w -s -X main.Version=$(VERSION)" \ - -tags="netgo" \ + -tags="$$tags" \ -o $(SRCDIR)/.dist/gguf-parser-$$os-$$arch$$suffix; \ done; \ if [[ $$os == "darwin" ]]; then \ diff --git a/README.md b/README.md index 1047cd3..b4e5e09 100644 --- a/README.md +++ b/README.md @@ -23,12 +23,7 @@ go get github.com/thxcode/gguf-parser-go ``` -You can also use the command-line package. - -```shell -go install github.com/thxcode/gguf-parser-go/cmd/gguf-parser - -``` +If you need one-shot command-line, try [gguf-parser](./cmd/gguf-parser) please. 
## Examples diff --git a/cmd/gguf-parser/README.md b/cmd/gguf-parser/README.md index f3d8174..a205977 100644 --- a/cmd/gguf-parser/README.md +++ b/cmd/gguf-parser/README.md @@ -76,15 +76,15 @@ Usage of gguf-parser ...: ### Parse -#### parse local GGUF file +#### Parse local GGUF file ```shell $ gguf-parser --path="~/.cache/lm-studio/models/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF/Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf" -+--------------+-------+-------+----------------------+----------------+---------------+----------+------------+----------+ -| \ | Name | Arch | Quantization Version | File Type | Little Endian | Size | Parameters | BPW | -+--------------+-------+-------+----------------------+----------------+---------------+----------+------------+----------+ -| MODEL | jeffq | llama | 2 | IQ3_XXS/Q5_K_M | true | 4.78 GiB | 7.24 B | 5.67 bpw | -+--------------+-------+-------+----------------------+----------------+---------------+----------+------------+----------+ ++--------------+-------+-------+----------------+---------------+----------+------------+----------+ +| \ | Name | Arch | Quantization | Little Endian | Size | Parameters | BPW | ++--------------+-------+-------+----------------+---------------+----------+------------+----------+ +| MODEL | jeffq | llama | IQ3_XXS/Q5_K_M | true | 4.78 GiB | 7.24 B | 5.67 bpw | ++--------------+-------+-------+----------------+---------------+----------+------------+----------+ +--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+ | \ | Max Context Len | Embedding Len | Embedding GQA | Attention Head Cnt | Layers | Feed Forward Len | Expert Cnt | Vocabulary Len | @@ -98,23 +98,23 @@ $ gguf-parser --path="~/.cache/lm-studio/models/NousResearch/Hermes-2-Pro-Mistra | TOKENIZER | llama | 450.50 KiB | 32032 | 0 | 1 | 32000 | N/A | N/A | N/A | +--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ -+--------------+-------+--------------+-----------------+--------------+----------------+----------------+----------------------------------+------------+-------------+ -| \ | Arch | Context Size | Flash Attention | MMap Support | Offload Layers | Full Offloaded | UMA (RAM + VRAM) | NonUMA RAM | NonUMA VRAM | -+--------------+-------+--------------+-----------------+--------------+----------------+----------------+----------------------------------+------------+-------------+ -| ESTIMATE | llama | 32768 | false | true | 33 (32 + 1) | Yes | 88.39 MiB + 11.06 GiB = 8.68 GiB | 238.39 MiB | 11.06 GiB | -+--------------+-------+--------------+-----------------+--------------+----------------+----------------+----------------------------------+------------+-------------+ ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+---------------------------------+------------+-------------+ +| \ | Arch | Context Size | Flash Attention | MMap Support | Offload Layers | Full Offloaded | UMA (RAM + VRAM) | NonUMA RAM | NonUMA VRAM | ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+---------------------------------+------------+-------------+ +| ESTIMATE | llama | 32768 | false | true | 33 (32 + 1) | Yes | 88.39 MiB + 8.59 GiB = 8.68 GiB | 238.39 MiB | 11.06 GiB | 
++--------------+-------+--------------+-----------------+--------------+----------------+----------------+---------------------------------+------------+-------------+ ``` -#### parse remote GGUF file +#### Parse remote GGUF file ```shell $ gguf-parser --url="https://huggingface.co/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF/resolve/main/Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" -+--------------+----------+-------+----------------------+-------------+---------------+--------+------------+----------+ -| \ | Name | Arch | Quantization Version | File Type | Little Endian | Size | Parameters | BPW | -+--------------+----------+-------+----------------------+-------------+---------------+--------+------------+----------+ -| MODEL | emozilla | llama | 2 | Q4_K/Q3_K_M | true | 21 GiB | 46.70 B | 3.86 bpw | -+--------------+----------+-------+----------------------+-------------+---------------+--------+------------+----------+ ++--------------+----------+-------+--------------+---------------+--------+------------+----------+ +| \ | Name | Arch | Quantization | Little Endian | Size | Parameters | BPW | ++--------------+----------+-------+--------------+---------------+--------+------------+----------+ +| MODEL | emozilla | llama | Q4_K/Q3_K_M | true | 21 GiB | 46.70 B | 3.86 bpw | ++--------------+----------+-------+--------------+---------------+--------+------------+----------+ +--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+ | \ | Max Context Len | Embedding Len | Embedding GQA | Attention Head Cnt | Layers | Feed Forward Len | Expert Cnt | Vocabulary Len | @@ -131,7 +131,7 @@ $ gguf-parser --url="https://huggingface.co/NousResearch/Nous-Hermes-2-Mixtral-8 +--------------+-------+--------------+-----------------+--------------+----------------+----------------+------------------------------------+------------+-------------+ | \ | Arch | Context Size | Flash Attention | MMap Support | Offload Layers | Full Offloaded | UMA (RAM + VRAM) | NonUMA RAM | NonUMA VRAM | +--------------+-------+--------------+-----------------+--------------+----------------+----------------+------------------------------------+------------+-------------+ -| ESTIMATE | llama | 32768 | false | false | 33 (32 + 1) | Yes | 245.24 MiB + 27.31 GiB = 25.08 GiB | 395.24 MiB | 27.31 GiB | +| ESTIMATE | llama | 32768 | false | false | 33 (32 + 1) | Yes | 245.24 MiB + 24.84 GiB = 25.08 GiB | 395.24 MiB | 27.31 GiB | +--------------+-------+--------------+-----------------+--------------+----------------+----------------+------------------------------------+------------+-------------+ ``` @@ -140,11 +140,11 @@ $ gguf-parser --url="https://huggingface.co/NousResearch/Nous-Hermes-2-Mixtral-8 ```shell $ gguf-parser --hf-repo="openbmb/MiniCPM-Llama3-V-2_5-gguf" --hf-file="ggml-model-Q5_K_M.gguf" -+--------------+-------+-------+----------------------+----------------+---------------+----------+------------+----------+ -| \ | Name | Arch | Quantization Version | File Type | Little Endian | Size | Parameters | BPW | -+--------------+-------+-------+----------------------+----------------+---------------+----------+------------+----------+ -| MODEL | model | llama | 2 | IQ3_XXS/Q5_K_M | true | 5.33 GiB | 8.03 B | 5.70 bpw | -+--------------+-------+-------+----------------------+----------------+---------------+----------+------------+----------+ 
++--------------+-------+-------+----------------+---------------+----------+------------+----------+ +| \ | Name | Arch | Quantization | Little Endian | Size | Parameters | BPW | ++--------------+-------+-------+----------------+---------------+----------+------------+----------+ +| MODEL | model | llama | IQ3_XXS/Q5_K_M | true | 5.33 GiB | 8.03 B | 5.70 bpw | ++--------------+-------+-------+----------------+---------------+----------+------------+----------+ +--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+ | \ | Max Context Len | Embedding Len | Embedding GQA | Attention Head Cnt | Layers | Feed Forward Len | Expert Cnt | Vocabulary Len | @@ -161,7 +161,7 @@ $ gguf-parser --hf-repo="openbmb/MiniCPM-Llama3-V-2_5-gguf" --hf-file="ggml-mode +--------------+-------+--------------+-----------------+--------------+----------------+----------------+---------------------------------+------------+-------------+ | \ | Arch | Context Size | Flash Attention | MMap Support | Offload Layers | Full Offloaded | UMA (RAM + VRAM) | NonUMA RAM | NonUMA VRAM | +--------------+-------+--------------+-----------------+--------------+----------------+----------------+---------------------------------+------------+-------------+ -| ESTIMATE | llama | 8192 | false | true | 33 (32 + 1) | Yes | 84.61 MiB + 6.49 GiB = 5.68 GiB | 234.61 MiB | 6.49 GiB | +| ESTIMATE | llama | 8192 | false | true | 33 (32 + 1) | Yes | 84.61 MiB + 5.59 GiB = 5.68 GiB | 234.61 MiB | 6.49 GiB | +--------------+-------+--------------+-----------------+--------------+----------------+----------------+---------------------------------+------------+-------------+ ``` @@ -170,11 +170,11 @@ $ gguf-parser --hf-repo="openbmb/MiniCPM-Llama3-V-2_5-gguf" --hf-file="ggml-mode ```shell $ gguf-parser --ol-model="gemma2" -+--------------+--------+--------+----------------------+-----------+---------------+----------+------------+----------+ -| \ | Name | Arch | Quantization Version | File Type | Little Endian | Size | Parameters | BPW | -+--------------+--------+--------+----------------------+-----------+---------------+----------+------------+----------+ -| MODEL | gemma2 | gemma2 | 2 | Q4_0 | true | 5.06 GiB | 9.24 B | 4.71 bpw | -+--------------+--------+--------+----------------------+-----------+---------------+----------+------------+----------+ ++--------------+--------+--------+--------------+---------------+----------+------------+----------+ +| \ | Name | Arch | Quantization | Little Endian | Size | Parameters | BPW | ++--------------+--------+--------+--------------+---------------+----------+------------+----------+ +| MODEL | gemma2 | gemma2 | Q4_0 | true | 5.06 GiB | 9.24 B | 4.71 bpw | ++--------------+--------+--------+--------------+---------------+----------+------------+----------+ +--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+ | \ | Max Context Len | Embedding Len | Embedding GQA | Attention Head Cnt | Layers | Feed Forward Len | Expert Cnt | Vocabulary Len | @@ -191,15 +191,15 @@ $ gguf-parser --ol-model="gemma2" +--------------+--------+--------------+-----------------+--------------+----------------+----------------+---------------------------------+------------+-------------+ | \ | Arch | Context Size | Flash Attention | MMap Support | Offload Layers | Full Offloaded | UMA (RAM + VRAM) | NonUMA RAM | NonUMA VRAM | 
+--------------+--------+--------------+-----------------+--------------+----------------+----------------+---------------------------------+------------+-------------+ -| ESTIMATE | gemma2 | 8192 | false | true | 43 (42 + 1) | Yes | 65.97 MiB + 8.43 GiB = 7.05 GiB | 215.97 MiB | 8.43 GiB | +| ESTIMATE | gemma2 | 8192 | false | true | 43 (42 + 1) | Yes | 65.97 MiB + 6.99 GiB = 7.05 GiB | 215.97 MiB | 8.43 GiB | +--------------+--------+--------------+-----------------+--------------+----------------+----------------+---------------------------------+------------+-------------+ $ gguf-parser --ol-model="gemma2" --ol-crawl -+--------------+--------+--------+----------------------+-----------+---------------+----------+------------+----------+ -| \ | Name | Arch | Quantization Version | File Type | Little Endian | Size | Parameters | BPW | -+--------------+--------+--------+----------------------+-----------+---------------+----------+------------+----------+ -| MODEL | gemma2 | gemma2 | 2 | Q4_0 | true | 5.06 GiB | 9.24 B | 4.71 bpw | -+--------------+--------+--------+----------------------+-----------+---------------+----------+------------+----------+ ++--------------+--------+--------+--------------+---------------+----------+------------+----------+ +| \ | Name | Arch | Quantization | Little Endian | Size | Parameters | BPW | ++--------------+--------+--------+--------------+---------------+----------+------------+----------+ +| MODEL | gemma2 | gemma2 | Q4_0 | true | 5.06 GiB | 9.24 B | 4.71 bpw | ++--------------+--------+--------+--------------+---------------+----------+------------+----------+ +--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+ | \ | Max Context Len | Embedding Len | Embedding GQA | Attention Head Cnt | Layers | Feed Forward Len | Expert Cnt | Vocabulary Len | @@ -216,11 +216,35 @@ $ gguf-parser --ol-model="gemma2" --ol-crawl +--------------+--------+--------------+-----------------+--------------+----------------+----------------+---------------------------------+------------+-------------+ | \ | Arch | Context Size | Flash Attention | MMap Support | Offload Layers | Full Offloaded | UMA (RAM + VRAM) | NonUMA RAM | NonUMA VRAM | +--------------+--------+--------------+-----------------+--------------+----------------+----------------+---------------------------------+------------+-------------+ -| ESTIMATE | gemma2 | 8192 | false | true | 43 (42 + 1) | Yes | 65.99 MiB + 8.43 GiB = 7.05 GiB | 215.99 MiB | 8.43 GiB | +| ESTIMATE | gemma2 | 8192 | false | true | 43 (42 + 1) | Yes | 65.99 MiB + 6.99 GiB = 7.05 GiB | 215.99 MiB | 8.43 GiB | +--------------+--------+--------------+-----------------+--------------+----------------+----------------+---------------------------------+------------+-------------+ ``` +#### Parse Clip model + +```shell +$ gguf-parser --hf-repo="xtuner/llava-llama-3-8b-v1_1-gguf" --hf-file="llava-llama-3-8b-v1_1-mmproj-f16.gguf" ++--------------+-----------------------------------+------+--------------+---------------+------------+------------+-----------+ +| \ | Name | Arch | Quantization | Little Endian | Size | Parameters | BPW | ++--------------+-----------------------------------+------+--------------+---------------+------------+------------+-----------+ +| MODEL | openai/clip-vit-large-patch14-336 | clip | F16 | true | 595.49 MiB | 311.89 M | 16.02 bpw | 
++--------------+-----------------------------------+------+--------------+---------------+------------+------------+-----------+ + ++--------------+---------------+--------+------------------+---------+-----------------+ +| \ | Embedding Len | Layers | Feed Forward Len | Encoder | LLaVA Projector | ++--------------+---------------+--------+------------------+---------+-----------------+ +| ARCHITECTURE | 1024 | 23 | 4096 | Vision | mlp | ++--------------+---------------+--------+------------------+---------+-----------------+ + ++--------------+------+----------------+----------------+------------+ +| \ | Arch | Offload Layers | Full Offloaded | (V)RAM | ++--------------+------+----------------+----------------+------------+ +| ESTIMATE | clip | 24 | Yes | 595.49 MiB | ++--------------+------+----------------+----------------+------------+ + +``` + ### Estimate #### Estimate with full layers offload (default) @@ -230,7 +254,7 @@ $ gguf-parser --hf-repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --hf- +--------------+-------+--------------+-----------------+--------------+----------------+----------------+------------------------------------+------------+-------------+ | \ | Arch | Context Size | Flash Attention | MMap Support | Offload Layers | Full Offloaded | UMA (RAM + VRAM) | NonUMA RAM | NonUMA VRAM | +--------------+-------+--------------+-----------------+--------------+----------------+----------------+------------------------------------+------------+-------------+ -| ESTIMATE | llama | 32768 | false | false | 33 (32 + 1) | Yes | 245.24 MiB + 27.31 GiB = 25.08 GiB | 395.24 MiB | 27.31 GiB | +| ESTIMATE | llama | 32768 | false | false | 33 (32 + 1) | Yes | 245.24 MiB + 24.84 GiB = 25.08 GiB | 395.24 MiB | 27.31 GiB | +--------------+-------+--------------+-----------------+--------------+----------------+----------------+------------------------------------+------------+-------------+ ``` @@ -239,11 +263,11 @@ $ gguf-parser --hf-repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --hf- ```shell $ gguf-parser --hf-repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --hf-file="Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer --gpu-layers=0 -+--------------+-------+--------------+-----------------+--------------+----------------+----------------+----------------------------------+------------+-------------+ -| \ | Arch | Context Size | Flash Attention | MMap Support | Offload Layers | Full Offloaded | UMA (RAM + VRAM) | NonUMA RAM | NonUMA VRAM | -+--------------+-------+--------------+-----------------+--------------+----------------+----------------+----------------------------------+------------+-------------+ -| ESTIMATE | llama | 32768 | false | false | 0 | No | 25.09 GiB + 2.46 GiB = 25.09 GiB | 25.24 GiB | 2.46 GiB | -+--------------+-------+--------------+-----------------+--------------+----------------+----------------+----------------------------------+------------+-------------+ ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+-----------------------------+------------+-------------+ +| \ | Arch | Context Size | Flash Attention | MMap Support | Offload Layers | Full Offloaded | UMA (RAM + VRAM) | NonUMA RAM | NonUMA VRAM | ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+-----------------------------+------------+-------------+ +| ESTIMATE | llama | 32768 | false | false | 0 | No | 25.09 GiB + 0 B 
= 25.09 GiB | 25.24 GiB | 2.46 GiB | ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+-----------------------------+------------+-------------+ ``` @@ -251,11 +275,11 @@ $ gguf-parser --hf-repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --hf- ```shell $ gguf-parser --hf-repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --hf-file="Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer --gpu-layers=10 -+--------------+-------+--------------+-----------------+--------------+----------------+----------------+-----------------------------------+------------+-------------+ -| \ | Arch | Context Size | Flash Attention | MMap Support | Offload Layers | Full Offloaded | UMA (RAM + VRAM) | NonUMA RAM | NonUMA VRAM | -+--------------+-------+--------------+-----------------+--------------+----------------+----------------+-----------------------------------+------------+-------------+ -| ESTIMATE | llama | 32768 | false | false | 10 | No | 17.36 GiB + 10.19 GiB = 25.09 GiB | 17.51 GiB | 10.19 GiB | -+--------------+-------+--------------+-----------------+--------------+----------------+----------------+-----------------------------------+------------+-------------+ ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+------------------------------------+------------+-------------+ +| \ | Arch | Context Size | Flash Attention | MMap Support | Offload Layers | Full Offloaded | UMA (RAM + VRAM) | NonUMA RAM | NonUMA VRAM | ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+------------------------------------+------------+-------------+ +| ESTIMATE | llama | 4096 | false | false | 33 (32 + 1) | Yes | 189.24 MiB + 21.34 GiB = 21.53 GiB | 339.24 MiB | 21.89 GiB | ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+------------------------------------+------------+-------------+ ``` @@ -266,7 +290,7 @@ $ gguf-parser --hf-repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --hf- +--------------+-------+--------------+-----------------+--------------+----------------+----------------+------------------------------------+------------+-------------+ | \ | Arch | Context Size | Flash Attention | MMap Support | Offload Layers | Full Offloaded | UMA (RAM + VRAM) | NonUMA RAM | NonUMA VRAM | +--------------+-------+--------------+-----------------+--------------+----------------+----------------+------------------------------------+------------+-------------+ -| ESTIMATE | llama | 4096 | false | false | 33 (32 + 1) | Yes | 189.24 MiB + 21.89 GiB = 21.53 GiB | 339.24 MiB | 21.89 GiB | +| ESTIMATE | llama | 4096 | false | false | 33 (32 + 1) | Yes | 189.24 MiB + 21.34 GiB = 21.53 GiB | 339.24 MiB | 21.89 GiB | +--------------+-------+--------------+-----------------+--------------+----------------+----------------+------------------------------------+------------+-------------+ ``` @@ -278,7 +302,7 @@ $ gguf-parser --hf-repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --hf- +--------------+-------+--------------+-----------------+--------------+----------------+----------------+------------------------------------+------------+-------------+ | \ | Arch | Context Size | Flash Attention | MMap Support | Offload Layers | Full Offloaded | UMA (RAM + VRAM) | NonUMA RAM | NonUMA VRAM | 
+--------------+-------+--------------+-----------------+--------------+----------------+----------------+------------------------------------+------------+-------------+ -| ESTIMATE | llama | 32768 | true | false | 33 (32 + 1) | Yes | 245.24 MiB + 25.33 GiB = 25.08 GiB | 395.24 MiB | 25.33 GiB | +| ESTIMATE | llama | 32768 | true | false | 33 (32 + 1) | Yes | 245.24 MiB + 24.84 GiB = 25.08 GiB | 395.24 MiB | 25.33 GiB | +--------------+-------+--------------+-----------------+--------------+----------------+----------------+------------------------------------+------------+-------------+ ``` @@ -290,7 +314,7 @@ $ gguf-parser --hf-repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --hf- +--------------+-------+--------------+-----------------+--------------+----------------+----------------+----------------------------------+------------+-------------+ | \ | Arch | Context Size | Flash Attention | MMap Support | Offload Layers | Full Offloaded | UMA (RAM + VRAM) | NonUMA RAM | NonUMA VRAM | +--------------+-------+--------------+-----------------+--------------+----------------+----------------+----------------------------------+------------+-------------+ -| ESTIMATE | llama | 32768 | false | false | 0 | No | 25.09 GiB + 2.46 GiB = 25.09 GiB | 25.24 GiB | 2.46 GiB | +| ESTIMATE | llama | 32768 | false | false | 10 | No | 17.36 GiB + 7.73 GiB = 25.09 GiB | 17.51 GiB | 10.19 GiB | +--------------+-------+--------------+-----------------+--------------+----------------+----------------+----------------------------------+------------+-------------+ ``` @@ -302,21 +326,21 @@ $ gguf-parser --hf-repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --hf- +--------------+-------+--------------+-----------------+--------------+----------------+----------------+------------------------------------+------------+-------------+ | \ | Arch | Context Size | Flash Attention | MMap Support | Offload Layers | Full Offloaded | UMA (RAM + VRAM) | NonUMA RAM | NonUMA VRAM | +--------------+-------+--------------+-----------------+--------------+----------------+----------------+------------------------------------+------------+-------------+ -| ESTIMATE | llama | 32768 | false | false | 0 | No | 25.09 GiB + 2.46 GiB = 25.09 GiB | 25.24 GiB | 2.46 GiB | +| ESTIMATE | llama | 32768 | false | false | 0 | No | 25.09 GiB + 0 B = 25.09 GiB | 25.24 GiB | 2.46 GiB | + + + + + +----------------+ +------------------------------------+------------+-------------+ -| | | | | | 5 | | 21.23 GiB + 6.33 GiB = 25.09 GiB | 21.37 GiB | 6.33 GiB | +| | | | | | 5 | | 21.23 GiB + 3.86 GiB = 25.09 GiB | 21.37 GiB | 6.33 GiB | + + + + + +----------------+ +------------------------------------+------------+-------------+ -| | | | | | 10 | | 17.36 GiB + 10.19 GiB = 25.09 GiB | 17.51 GiB | 10.19 GiB | +| | | | | | 10 | | 17.36 GiB + 7.73 GiB = 25.09 GiB | 17.51 GiB | 10.19 GiB | + + + + + +----------------+ +------------------------------------+------------+-------------+ -| | | | | | 15 | | 13.50 GiB + 14.06 GiB = 25.09 GiB | 13.64 GiB | 14.06 GiB | +| | | | | | 15 | | 13.50 GiB + 11.59 GiB = 25.09 GiB | 13.64 GiB | 14.06 GiB | + + + + + +----------------+ +------------------------------------+------------+-------------+ -| | | | | | 20 | | 9.63 GiB + 17.92 GiB = 25.09 GiB | 9.78 GiB | 17.92 GiB | +| | | | | | 20 | | 9.63 GiB + 15.46 GiB = 25.09 GiB | 9.78 GiB | 17.92 GiB | + + + + + +----------------+ +------------------------------------+------------+-------------+ -| | | | | | 25 | | 5.77 GiB + 21.79 GiB = 25.09 GiB | 5.91 
GiB | 21.79 GiB | +| | | | | | 25 | | 5.77 GiB + 19.32 GiB = 25.09 GiB | 5.91 GiB | 21.79 GiB | + + + + + +----------------+ +------------------------------------+------------+-------------+ -| | | | | | 30 | | 1.90 GiB + 25.65 GiB = 25.09 GiB | 2.05 GiB | 25.65 GiB | +| | | | | | 30 | | 1.90 GiB + 23.19 GiB = 25.09 GiB | 2.05 GiB | 25.65 GiB | + + + + + +----------------+----------------+------------------------------------+------------+-------------+ -| | | | | | 33 (32 + 1) | Yes | 245.24 MiB + 27.31 GiB = 25.08 GiB | 395.24 MiB | 27.31 GiB | +| | | | | | 33 (32 + 1) | Yes | 245.24 MiB + 24.84 GiB = 25.08 GiB | 395.24 MiB | 27.31 GiB | +--------------+-------+--------------+-----------------+--------------+----------------+----------------+------------------------------------+------------+-------------+ ``` diff --git a/cmd/gguf-parser/main.go b/cmd/gguf-parser/main.go index 34dba80..eaaaa74 100644 --- a/cmd/gguf-parser/main.go +++ b/cmd/gguf-parser/main.go @@ -11,9 +11,11 @@ import ( "github.com/olekukonko/tablewriter" + "github.com/thxcode/gguf-parser-go/util/anyx" "github.com/thxcode/gguf-parser-go/util/json" . "github.com/thxcode/gguf-parser-go" + "regexp" ) var Version = "v0.0.0" @@ -31,6 +33,7 @@ func main() { hfFile string olModel string olCrawl bool + olUsage bool // read options debug bool skipProxy bool @@ -49,6 +52,7 @@ func main() { offloadLayersStep uint64 // output options version bool + raw bool skipModel bool skipArchitecture bool skipTokenizer bool @@ -69,18 +73,22 @@ func main() { "https://huggingface.co/NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF"+ "/resolve/main/Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf. "+ "Note that gguf-parser does not need to download the entire GGUF file.") - fs.StringVar(&hfRepo, "repo", hfRepo, "Repository of HuggingFace which the GGUF file store, e.g. "+ - "NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF, works with --file. [Deprecated, use --hf-repo instead]") - fs.StringVar(&hfFile, "file", hfFile, "Model file below the --repo, e.g. "+ - "Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf. [Deprecated, use --hf-file instead]") // Deprecated. + fs.StringVar(&hfRepo, "repo", hfRepo, "[DEPRECATED, use --hf-repo instead] "+ // Deprecated, remove when release v0.3.0. + "Repository of HuggingFace which the GGUF file store, e.g. "+ + "NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF, works with --file.") + fs.StringVar(&hfFile, "file", hfFile, "[DEPRECATED, use --hf-file instead] "+ // Deprecated, remove when release v0.3.0. + "Model file below the --repo, e.g. "+ + "Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf.") fs.StringVar(&hfRepo, "hf-repo", hfRepo, "Repository of HuggingFace which the GGUF file store, e.g. "+ - "NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF, works with --hf-file.") // Deprecated. + "NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF, works with --hf-file.") fs.StringVar(&hfFile, "hf-file", hfFile, "Model file below the --repo, e.g. "+ "Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf.") fs.StringVar(&olModel, "ol-model", olModel, "Model name of Ollama, e.g. 
"+ "gemma2.") fs.BoolVar(&olCrawl, "ol-crawl", olCrawl, "Crawl the Ollama model instead of blobs fetching, "+ "which will be more efficient and faster, but lossy.") + fs.BoolVar(&olUsage, "ol-usage", olUsage, "Specify respecting the extending layers introduced by Ollama, "+ + "which affects the usage estimation.") fs.BoolVar(&debug, "debug", debug, "Enable debugging, verbosity.") fs.BoolVar(&skipProxy, "skip-proxy", skipProxy, "Skip proxy settings, "+ "works with --url/--hf-*/--ol-*, "+ @@ -119,17 +127,20 @@ func main() { fs.BoolVar(&noMMap, "no-mmap", noMMap, "Specify disabling Memory-Mapped using, "+ "which is used to estimate the usage. "+ "Memory-Mapped can avoid loading the entire model weights into RAM.") - fs.IntVar(&offloadLayers, "offload-layers", offloadLayers, "Specify how many layers to offload, "+ + fs.IntVar(&offloadLayers, "offload-layers", offloadLayers, "[DEPRECATED, use --gpu-layers instead] "+ // Deprecated, remove when release v0.3.0. + "Specify how many layers to offload, "+ "which is used to estimate the usage, "+ - "default is full offloaded. [Deprecated, use --gpu-layers instead]") // Deprecated. + "default is full offloaded.") fs.IntVar(&offloadLayers, "gpu-layers", offloadLayers, "Specify how many layers to offload, "+ "which is used to estimate the usage, "+ "default is full offloaded.") - fs.Uint64Var(&offloadLayersStep, "offload-layers-step", offloadLayersStep, "Specify the step of layers to offload, "+ - "works with --offload-layers. [Deprecated, use --gpu-layers-step instead]") // Deprecated. + fs.Uint64Var(&offloadLayersStep, "offload-layers-step", offloadLayersStep, "[DEPRECATED, use --gpu-layers-step instead] "+ // Deprecated, remove when release v0.3.0. + "Specify the step of layers to offload, "+ + "works with --offload-layers.") fs.Uint64Var(&offloadLayersStep, "gpu-layers-step", offloadLayersStep, "Specify the step of layers to offload, "+ "works with --gpu-layers.") fs.BoolVar(&version, "version", version, "Show gguf-parser version.") + fs.BoolVar(&raw, "raw", raw, "Output the file only, skip anything.") fs.BoolVar(&skipModel, "skip-model", skipModel, "Skip to display model metadata.") fs.BoolVar(&skipArchitecture, "skip-architecture", skipArchitecture, "Skip to display architecture metadata.") fs.BoolVar(&skipTokenizer, "skip-tokenizer", skipTokenizer, "Skip to display tokenizer metadata") @@ -226,7 +237,36 @@ func main() { case hfRepo != "" && hfFile != "": gf, err = ParseGGUFFileFromHuggingFace(ctx, hfRepo, hfFile, ropts...) case olModel != "": - gf, err = ParseGGUFFileFromOllama(ctx, olModel, olCrawl, ropts...) + om := ParseOllamaModel(olModel) + gf, err = ParseGGUFFileFromOllamaModel(ctx, om, olCrawl, ropts...) + if om != nil && olUsage { + // Parameters override. + { + ps, _ := om.Params(ctx, nil) + if v, ok := ps["num_ctx"]; ok { + eopts = append(eopts, WithContextSize(anyx.Number[int32](v))) + } else if ctxSize <= 0 { + eopts = append(eopts, WithContextSize(2048)) + } + if v, ok := ps["use_mmap"]; ok && !anyx.Bool(v) { + noMMap = true + } + if v, ok := ps["num_gpu"]; ok { + offloadLayers = anyx.Number[int](v) + } + } + // Projector overlap, + // in here, we just assume the projector is overlapped with its size to VRAM. 
+ { + var sz uint64 + mls := om.SearchLayers(regexp.MustCompile(`^application/vnd\.ollama\.image\.projector$`)) + for i := range mls { + sz += mls[i].Size + } + eopts = append(eopts, WithClipUsage(sz)) + } + + } } if err != nil { _, _ = fmt.Fprintf(os.Stderr, "failed to parse GGUF file: %s\n", err.Error()) @@ -234,6 +274,22 @@ func main() { } } + // Output raw. + + if raw { + enc := json.NewEncoder(os.Stdout) + if inPrettyJson { + enc.SetIndent("", " ") + } + if err := enc.Encode(gf); err != nil { + _, _ = fmt.Fprintf(os.Stderr, "failed to encode JSON: %s\n", err.Error()) + os.Exit(1) + } + return + } + + // Otherwise, display the metadata and estimate the usage. + var ( m GGUFModelMetadata a GGUFArchitectureMetadata @@ -258,7 +314,8 @@ func main() { e = gf.EstimateLLaMACppUsage(eopts...) } - // Output + // Then, output as JSON or table. + var ( mmap = !noMMap platformRAM, platformVRAM uint64 @@ -285,36 +342,38 @@ func main() { if !skipArchitecture { o["architecture"] = a } - if !skipTokenizer { + if !skipTokenizer && t.Model != "" { o["tokenizer"] = t } if !skipEstimate { es := e.Summarize(mmap, platformRAM, platformVRAM) - switch { - case offloadLayersStep > e.OffloadLayers: - offloadLayersStep = e.OffloadLayers - case offloadLayersStep <= 0: - offloadLayersStep = e.OffloadLayers - } - if offloadLayersStep < e.OffloadLayers { - cnt := e.OffloadLayers/offloadLayersStep + 1 - if e.OffloadLayers%offloadLayersStep != 0 || e.FullOffloaded { - cnt++ + if e.Architecture != "clip" { + switch { + case offloadLayersStep > e.OffloadLayers: + offloadLayersStep = e.OffloadLayers + case offloadLayersStep <= 0: + offloadLayersStep = e.OffloadLayers } - ess := make([]LLaMACppUsageEstimateMemorySummary, cnt) - var wg sync.WaitGroup - for i := 0; i < cap(ess); i++ { - wg.Add(1) - go func(i int) { - defer wg.Done() - eopts := eopts[:len(eopts):len(eopts)] - eopts = append(eopts, WithOffloadLayers(uint64(i)*offloadLayersStep)) - ess[i] = gf.EstimateLLaMACppUsage(eopts...).SummarizeMemory(mmap, platformRAM, platformVRAM) - }(i) + if offloadLayersStep < e.OffloadLayers { + cnt := e.OffloadLayers/offloadLayersStep + 1 + if e.OffloadLayers%offloadLayersStep != 0 || e.FullOffloaded { + cnt++ + } + ess := make([]LLaMACppUsageEstimateMemorySummary, cnt) + var wg sync.WaitGroup + for i := 0; i < cap(ess); i++ { + wg.Add(1) + go func(i int) { + defer wg.Done() + eopts := eopts[:len(eopts):len(eopts)] + eopts = append(eopts, WithOffloadLayers(uint64(i)*offloadLayersStep)) + ess[i] = gf.EstimateLLaMACppUsage(eopts...).SummarizeMemory(mmap, platformRAM, platformVRAM) + }(i) + } + wg.Wait() + ess[cap(ess)-1] = es.Memory[0] + es.Memory = ess } - wg.Wait() - ess[cap(ess)-1] = es.Memory[0] - es.Memory = ess } o["estimate"] = es } @@ -336,12 +395,11 @@ func main() { if !skipModel { tprint( "MODEL", - []string{"Name", "Arch", "Quantization Version", "File Type", "Little Endian", "Size", "Parameters", "BPW"}, + []string{"Name", "Arch", "Quantization", "Little Endian", "Size", "Parameters", "BPW"}, nil, []string{ m.Name, m.Architecture, - sprintf(m.QuantizationVersion), sprintf(m.FileType), sprintf(m.LittleEndian), sprintf(m.Size), @@ -351,11 +409,13 @@ func main() { } if !skipArchitecture { - tprint( - "ARCHITECTURE", - []string{"Max Context Len", "Embedding Len", "Embedding GQA", "Attention Head Cnt", "Layers", "Feed Forward Len", "Expert Cnt", "Vocabulary Len"}, - nil, - []string{ + var ( + hd []string + bd []string + ) + if a.Architecture != "clip" { + hd = []string{"Max Context Len", "Embedding Len", "Embedding GQA", 
"Attention Head Cnt", "Layers", "Feed Forward Len", "Expert Cnt", "Vocabulary Len"} + bd = []string{ sprintf(a.MaximumContextLength), sprintf(a.EmbeddingLength), sprintf(a.EmbeddingGQA), @@ -364,10 +424,25 @@ func main() { sprintf(a.FeedForwardLength), sprintf(a.ExpertCount), sprintf(a.VocabularyLength), - }) + } + } else { + hd = []string{"Embedding Len", "Layers", "Feed Forward Len", "Encoder", "LLaVA Projector"} + bd = []string{ + sprintf(a.EmbeddingLength), + sprintf(a.BlockCount), + sprintf(a.FeedForwardLength), + sprintf(tenary(a.ClipHasTextEncoder, tenary(a.ClipHasVisionEncoder, "Text & Vision", "Text"), tenary(a.ClipHasVisionEncoder, "Vision", "N/A"))), + sprintf(tenary(a.ClipHasLLaVaProjector, a.ClipProjectorType, "N/A")), + } + } + tprint( + "ARCHITECTURE", + hd, + nil, + bd) } - if !skipTokenizer { + if !skipTokenizer && t.Model != "" { tprint( "TOKENIZER", []string{"Model", "Tokens Size", "Tokens Len", "Added Tokens Len", "BOS Token", "EOS Token", "Unknown Token", "Separator Token", "Padding Token"}, @@ -386,85 +461,84 @@ func main() { } if !skipEstimate { + var ( + hd []string + mg []int + bds [][]string + ) es := e.Summarize(mmap, platformRAM, platformVRAM) - switch { - case offloadLayersStep > e.OffloadLayers: - offloadLayersStep = e.OffloadLayers - case offloadLayersStep <= 0: - offloadLayersStep = e.OffloadLayers - } - if offloadLayersStep < e.OffloadLayers { - cnt := e.OffloadLayers/offloadLayersStep + 1 - if e.OffloadLayers%offloadLayersStep != 0 || e.FullOffloaded { - cnt++ + if e.Architecture != "clip" { + hd = []string{"Arch", "Context Size", "Flash Attention", "MMap Support", "Offload Layers", "Full Offloaded", "UMA (RAM + VRAM)", "NonUMA RAM", "NonUMA VRAM"} + mg = []int{0, 1, 2, 3, 5} + + switch { + case offloadLayersStep > e.OffloadLayers: + offloadLayersStep = e.OffloadLayers + case offloadLayersStep <= 0: + offloadLayersStep = e.OffloadLayers } - ess := make([]LLaMACppUsageEstimateMemorySummary, cnt) - var wg sync.WaitGroup - for i := 0; i < cap(ess); i++ { - wg.Add(1) - go func(i int) { - defer wg.Done() - eopts := eopts[:len(eopts):len(eopts)] - eopts = append(eopts, WithOffloadLayers(uint64(i)*offloadLayersStep)) - ess[i] = gf.EstimateLLaMACppUsage(eopts...).SummarizeMemory(mmap, platformRAM, platformVRAM) - }(i) + if offloadLayersStep < e.OffloadLayers { + cnt := e.OffloadLayers/offloadLayersStep + 1 + if e.OffloadLayers%offloadLayersStep != 0 || e.FullOffloaded { + cnt++ + } + ess := make([]LLaMACppUsageEstimateMemorySummary, cnt) + var wg sync.WaitGroup + for i := 0; i < cap(ess); i++ { + wg.Add(1) + go func(i int) { + defer wg.Done() + eopts := eopts[:len(eopts):len(eopts)] + eopts = append(eopts, WithOffloadLayers(uint64(i)*offloadLayersStep)) + ess[i] = gf.EstimateLLaMACppUsage(eopts...).SummarizeMemory(mmap, platformRAM, platformVRAM) + }(i) + } + wg.Wait() + ess[cap(ess)-1] = es.Memory[0] + es.Memory = ess } - wg.Wait() - ess[cap(ess)-1] = es.Memory[0] - es.Memory = ess - } - bd := make([][]string, len(es.Memory)) - for i := range es.Memory { - bd[i] = []string{ - sprintf(es.Architecture), - sprintf(es.ContextSize), - sprintf(es.FlashAttention), - sprintf(!es.NoMMap), - sprintf(tenary(es.Memory[i].FullOffloaded, sprintf("%d (%d + 1)", es.Memory[i].OffloadLayers, es.Memory[i].OffloadLayers-1), es.Memory[i].OffloadLayers)), - sprintf(tenary(es.Memory[i].FullOffloaded, "Yes", "No")), - sprintf("%s + %s = %s", es.Memory[i].UMA.RAM, es.Memory[i].NonUMA.VRAM, es.Memory[i].UMA.RAM+es.Memory[i].UMA.VRAM), - sprintf(es.Memory[i].NonUMA.RAM), - 
sprintf(es.Memory[i].NonUMA.VRAM), + + bds = make([][]string, len(es.Memory)) + for i := range es.Memory { + bds[i] = []string{ + sprintf(es.Architecture), + sprintf(es.ContextSize), + sprintf(es.FlashAttention), + sprintf(!es.NoMMap), + sprintf(tenary(es.Memory[i].FullOffloaded, sprintf("%d (%d + 1)", es.Memory[i].OffloadLayers, es.Memory[i].OffloadLayers-1), es.Memory[i].OffloadLayers)), + sprintf(tenary(es.Memory[i].FullOffloaded, "Yes", "No")), + sprintf("%s + %s = %s", es.Memory[i].UMA.RAM, es.Memory[i].UMA.VRAM, es.Memory[i].UMA.RAM+es.Memory[i].UMA.VRAM), + sprintf(es.Memory[i].NonUMA.RAM), + sprintf(es.Memory[i].NonUMA.VRAM), + } + } + } else { + hd = []string{"Arch", "Offload Layers", "Full Offloaded", "(V)RAM"} + bds = [][]string{ + { + sprintf(es.Architecture), + sprintf(es.Memory[0].OffloadLayers), + sprintf(tenary(es.Memory[0].FullOffloaded, "Yes", "No")), + sprintf(max(es.Memory[0].UMA.RAM, es.Memory[0].UMA.VRAM)), + }, } } tprint( "ESTIMATE", - []string{"Arch", "Context Size", "Flash Attention", "MMap Support", "Offload Layers", "Full Offloaded", "UMA (RAM + VRAM)", "NonUMA RAM", "NonUMA VRAM"}, - []int{0, 1, 2, 3, 5}, - bd...) + hd, + mg, + bds...) } } func sprintf(f any, a ...any) string { - switch v := f.(type) { - case string: + if v, ok := f.(string); ok { if len(a) != 0 { return fmt.Sprintf(v, a...) } return v - case []byte: - return string(v) - case int: - return strconv.Itoa(v) - case int32: - return strconv.Itoa(int(v)) - case int64: - return strconv.Itoa(int(v)) - case uint: - return strconv.Itoa(int(v)) - case uint32: - return strconv.Itoa(int(v)) - case uint64: - return strconv.Itoa(int(v)) - case float32: - return strconv.FormatFloat(float64(v), 'f', -1, 32) - case float64: - return strconv.FormatFloat(v, 'f', -1, 64) - case bool: - return strconv.FormatBool(v) - default: - return fmt.Sprintf("%v", v) } + return anyx.String(f) } func tprint(title string, header []string, merges []int, body ...[]string) { diff --git a/file.go b/file.go index 8425c2d..6f8841d 100644 --- a/file.go +++ b/file.go @@ -410,11 +410,14 @@ func (gf *GGUFFile) layers() GGUFLayerTensorInfos { pm := make(map[string]any) for i := range gf.TensorInfos { ps := strings.Split(gf.TensorInfos[i].Name, ".") + if len(ps) < 2 { + ret = append(ret, gf.TensorInfos[i]) + continue + } switch { default: ret = append(ret, gf.TensorInfos[i]) - continue - case len(ps) >= 2 && ps[0] == "blk": + case ps[0] == "blk" || ps[0] == "mm": p := strings.Join([]string{ps[0], ps[1]}, ".") if _, ok := pm[p]; !ok { l := &GGUFNamedTensorInfos{Name: p} @@ -423,7 +426,27 @@ func (gf *GGUFFile) layers() GGUFLayerTensorInfos { } l := pm[p].(*GGUFNamedTensorInfos) l.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, gf.TensorInfos[i]) - case len(ps) >= 3 && (ps[0] == "decoder" || ps[0] == "encoder"): + case ps[0] == "v" || ps[0] == "t": // Clip. + p := ps[0] + if _, ok := pm[p]; !ok { + xl := &GGUFNamedTensorInfos{Name: p} + pm[p] = xl + ret = append(ret, xl) + } + xl := pm[p].(*GGUFNamedTensorInfos) + if ps[1] != "blk" || len(ps) < 3 { + xl.GGUFLayerTensorInfos = append(xl.GGUFLayerTensorInfos, gf.TensorInfos[i]) + continue + } + p = strings.Join([]string{ps[0], ps[1], ps[2]}, ".") + if _, ok := pm[p]; !ok { + l := &GGUFNamedTensorInfos{Name: p} + pm[p] = l + xl.GGUFLayerTensorInfos = append(xl.GGUFLayerTensorInfos, l) + } + l := pm[p].(*GGUFNamedTensorInfos) + l.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, gf.TensorInfos[i]) + case ps[0] == "decoder" || ps[0] == "encoder": // BERT. 
p := ps[0] if _, ok := pm[p]; !ok { xl := &GGUFNamedTensorInfos{Name: p} @@ -431,7 +454,7 @@ func (gf *GGUFFile) layers() GGUFLayerTensorInfos { ret = append(ret, xl) } xl := pm[p].(*GGUFNamedTensorInfos) - if ps[1] != "block" { + if ps[1] != "block" || len(ps) < 3 { xl.GGUFLayerTensorInfos = append(xl.GGUFLayerTensorInfos, gf.TensorInfos[i]) continue } diff --git a/file_architecture.go b/file_architecture.go index 513787a..a0f7de3 100644 --- a/file_architecture.go +++ b/file_architecture.go @@ -45,7 +45,7 @@ type GGUFArchitectureMetadata struct { AttentionClampKQV float32 `json:"attentionClampKQV,omitempty"` // AttentionLayerNormEpsilon is the epsilon value used in the LayerNorm(Layer Normalization). AttentionLayerNormEpsilon float32 `json:"attentionLayerNormEpsilon,omitempty"` - // AttentionLayerNormRMSEpsilon is the epsilon value used in the RMSNorm(Root Mean Square Layer Normalization), + // AttentionLayerNormRMSEpsilon is the epsilon value used in the RMSNorm(root Mean Square Layer Normalization), // which is a simplification of the original LayerNorm. AttentionLayerNormRMSEpsilon float32 `json:"attentionLayerNormRMSEpsilon,omitempty"` // AttentionKeyLength(n_embd_head_k) is the size of a key head. @@ -91,6 +91,23 @@ type GGUFArchitectureMetadata struct { EmbeddingValueGQA uint64 `json:"embeddingValueGQA,omitempty"` // EmbeddingGGQA is the GQA of the embedding layer. EmbeddingGQA uint64 `json:"embeddingGQA,omitempty"` + + // ClipHasTextEncoder indicates whether the clip model has text encoder or not. + // + // Only used when Architecture is "clip". + ClipHasTextEncoder bool `json:"clipHasTextEncoder,omitempty"` + // ClipHasVisionEncoder indicates whether the clip model has vision encoder or not. + // + // Only used when Architecture is "clip". + ClipHasVisionEncoder bool `json:"clipHasVisionEncoder,omitempty"` + // ClipHasLLaVaProjector indicates whether the clip model has LLaVa projector or not. + // + // Only used when Architecture is "clip". + ClipHasLLaVaProjector bool `json:"clipHasLLaVaProjector,omitempty"` + // ClipProjectorType is the type of the projector used in the clip model. + // + // Only used when Architecture is "clip". + ClipProjectorType string `json:"clipProjectorType,omitempty"` } // Architecture returns the architecture metadata of the GGUF file. 
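
The `Clip*` fields added above give callers a typed way to recognize multimodal projector files. Below is a minimal sketch of how they might be read, assuming the `ParseGGUFFileFromHuggingFace` helper shown later in this patch and the LLaVA mmproj file used in the cmd/gguf-parser README example; the `parser` import alias is illustrative only:

```go
package main

import (
	"context"
	"fmt"

	parser "github.com/thxcode/gguf-parser-go"
)

func main() {
	gf, err := parser.ParseGGUFFileFromHuggingFace(context.Background(),
		"xtuner/llava-llama-3-8b-v1_1-gguf", "llava-llama-3-8b-v1_1-mmproj-f16.gguf")
	if err != nil {
		panic(err)
	}
	a := gf.Architecture()
	if a.Architecture == "clip" {
		// ClipProjectorType falls back to "mlp" when the metadata key is absent.
		fmt.Println(a.ClipHasVisionEncoder, a.ClipHasLLaVaProjector, a.ClipProjectorType)
	}
}
```
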
@@ -99,6 +116,120 @@ func (gf *GGUFFile) Architecture() (ga GGUFArchitectureMetadata) { if v, ok := gf.Header.MetadataKV.Get("general.architecture"); ok { arch = v.ValueString() } + + if arch == "clip" { + return gf.clipArchitecture() + } + return gf.transformArchitecture(arch) +} + +func (gf *GGUFFile) clipArchitecture() (ga GGUFArchitectureMetadata) { + var ( + hasTextEncoderKey = "clip.has_text_encoder" + hasVisionEncoderKey = "clip.has_vision_encoder" + hasLLaVaProjectorKey = "clip.has_llava_projector" + projectorTypeKey = "clip.projector_type" + + textEmbeddingLengthKey = "clip.text.embedding_length" + textBlockCountKey = "clip.text.block_count" + textFeedForwardLengthKey = "clip.text.feed_forward_length" + textAttentionHeadCountKey = "clip.text.attention.head_count" + textAttentionLayerNormRMSEpsilonKey = "clip.text.attention.layer_norm_epsilon" + + visionEmbeddingLengthKey = "clip.vision.embedding_length" + visionBlockCountKey = "clip.vision.block_count" + visionFeedForwardLengthKey = "clip.vision.feed_forward_length" + visionAttentionHeadCountKey = "clip.vision.attention.head_count" + visionAttentionLayerNormRMSEpsilonKey = "clip.vision.attention.layer_norm_epsilon" + ) + + ga.Architecture = "clip" + + m, _ := gf.Header.MetadataKV.Index([]string{ + hasTextEncoderKey, + hasVisionEncoderKey, + hasLLaVaProjectorKey, + projectorTypeKey, + textEmbeddingLengthKey, + textBlockCountKey, + textFeedForwardLengthKey, + textAttentionHeadCountKey, + textAttentionLayerNormRMSEpsilonKey, + visionEmbeddingLengthKey, + visionBlockCountKey, + visionFeedForwardLengthKey, + visionAttentionHeadCountKey, + visionAttentionLayerNormRMSEpsilonKey, + }) + + if v, ok := m[hasTextEncoderKey]; ok { + ga.ClipHasTextEncoder = v.ValueBool() + } + if v, ok := m[hasVisionEncoderKey]; ok { + ga.ClipHasVisionEncoder = v.ValueBool() + } + if v, ok := m[hasLLaVaProjectorKey]; ok { + ga.ClipHasLLaVaProjector = v.ValueBool() + } + if v, ok := m[projectorTypeKey]; ok { + ga.ClipProjectorType = v.ValueString() + } else { + ga.ClipProjectorType = "mlp" + } + + if v, ok := m[textEmbeddingLengthKey]; ok { + ga.EmbeddingLength = ValueNumeric[uint64](v) + } + if v, ok := m[textBlockCountKey]; ok { + ga.BlockCount = ValueNumeric[uint64](v) + } + if v, ok := m[textFeedForwardLengthKey]; ok { + ga.FeedForwardLength = ValueNumeric[uint64](v) + } + if v, ok := m[textAttentionHeadCountKey]; ok { + ga.AttentionHeadCount = ValueNumeric[uint64](v) + } + if v, ok := m[textAttentionLayerNormRMSEpsilonKey]; ok { + ga.AttentionLayerNormRMSEpsilon = ValueNumeric[float32](v) + } + + if v, ok := m[visionEmbeddingLengthKey]; ok { + ga.EmbeddingLength = ValueNumeric[uint64](v) + } + if v, ok := m[visionBlockCountKey]; ok { + ga.BlockCount = ValueNumeric[uint64](v) + } + if v, ok := m[visionFeedForwardLengthKey]; ok { + ga.FeedForwardLength = ValueNumeric[uint64](v) + } + if v, ok := m[visionAttentionHeadCountKey]; ok { + ga.AttentionHeadCount = ValueNumeric[uint64](v) + } + if v, ok := m[visionAttentionLayerNormRMSEpsilonKey]; ok { + ga.AttentionLayerNormRMSEpsilon = ValueNumeric[float32](v) + } + + ga.AttentionHeadCountKV = ga.AttentionHeadCount + + { + if ga.AttentionHeadCountKV > 0 { + ga.EmbeddingGroup = ga.AttentionHeadCount / ga.AttentionHeadCountKV + } + if ga.AttentionHeadCount > 0 { + ga.EmbeddingKeyGQA = uint64(ga.AttentionKeyLength) * ga.AttentionHeadCountKV + ga.EmbeddingValueGQA = uint64(ga.AttentionValueLength) * ga.AttentionHeadCountKV + } + if ga.Architecture == "mamba" { + ga.EmbeddingKeyGQA = uint64((ga.SSMConvolutionKernel - 
1) * ga.SSMInnerSize) + ga.EmbeddingValueGQA = uint64(ga.SSMStateSize * ga.SSMInnerSize) + } + ga.EmbeddingGQA = ga.EmbeddingValueGQA + } + + return ga +} + +func (gf *GGUFFile) transformArchitecture(arch string) (ga GGUFArchitectureMetadata) { var ( contextLengthKey = arch + ".context_length" embeddingLengthKey = arch + ".embedding_length" diff --git a/file_estimate.go b/file_estimate.go index c211986..453afa6 100644 --- a/file_estimate.go +++ b/file_estimate.go @@ -42,6 +42,8 @@ type ( KVCache LLaMACppKVCacheUsage `json:"kvCache"` // Computation is the memory usage of computation. Computation LLaMACppComputationUsage `json:"computation"` + // Clipper is the memory usage of clipper. + Clipper GGUFBytesScalar `json:"clipper"` } // LLaMACppWeightUsage represents the memory usage of loading weights in llama.cpp. @@ -170,6 +172,11 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) ( isOffloadOutputLayer bool ) { + // For clip, + // see https://github.com/ggerganov/llama.cpp/blob/148ec970b62c3c5ae0a8bfdaad2fc237aaae350d/examples/llava/clip.cpp#L994-L1008. + if a.Architecture == "clip" { + o.OffloadLayers = ptr.To(a.BlockCount + 1) // Clip means full offload. + } if v := o.OffloadLayers; v == nil { o.OffloadLayers = ptr.To(a.BlockCount) nOffloadLayers = a.BlockCount @@ -221,12 +228,17 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) ( // Weight. { // Compute. - for i, offloadStart := uint64(0), uint64(len(tfLs))-nOffloadLayers; i < uint64(len(tfLs)); i++ { - switch { - case i < nLoadLayers: - e.Load.Weight.Compute += GGUFBytesScalar(tfLs[i].Bytes()) - case i >= offloadStart: - e.Offload.Weight.Compute += GGUFBytesScalar(tfLs[i].Bytes()) + switch a.Architecture { + case "clip": + e.Offload.Weight.Compute = GGUFBytesScalar(ls.Bytes()) + default: + for i, offloadStart := uint64(0), uint64(len(tfLs))-nOffloadLayers; i < uint64(len(tfLs)); i++ { + switch { + case i < nLoadLayers: + e.Load.Weight.Compute += GGUFBytesScalar(tfLs[i].Bytes()) + case i >= offloadStart: + e.Offload.Weight.Compute += GGUFBytesScalar(tfLs[i].Bytes()) + } } } @@ -290,10 +302,13 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) ( inpSMask = GGMLTypeF32.RowSizeOf([]uint64{1, nKV}) // F32 [1, n_kv] inpSSeq = GGMLTypeI32.RowSizeOf([]uint64{nKV, nBatch}) // I32 [n_kv, n_batch] ) - if a.Architecture == "mamba" { + switch a.Architecture { + case "clip": + // NOP. + case "mamba": e.Load.Computation.Input = GGUFBytesScalar(inpTokens + inpEmbd + inpSMask + inpSSeq + inpOutIds) e.Offload.Computation.Input = GGUFBytesScalar(inpEmbd + inpSMask + inpSSeq + inpOutIds) - } else { + default: e.Load.Computation.Input = GGUFBytesScalar(inpTokens + inpEmbd + inpPos + inpKQMask + inpOutIds) e.Offload.Computation.Input = GGUFBytesScalar(inpEmbd + inpPos + inpKQMask + inpOutIds) } @@ -301,7 +316,10 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) ( // the allocated memory can be reused for the next layer. // So, we only consider the usage of the largest layer, // which is the last layer by default. - if a.Architecture == "mamba" { + switch a.Architecture { + case "clip": + // NOP. 
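+			// For clip, the estimate tracks only the (fully offloaded) weights; no compute-graph buffer is modeled here.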
+ case "mamba": convInc := GGMLTypeF32.RowSizeOf([]uint64{a.EmbeddingKeyGQA, nKV}) // F32 [n_embd_key_gqa, n_kv] reshape for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.(attn_norm|ssm_in|ssm_conv1d)\.weight`)) { if !strings.HasSuffix(l.Name, ".ssm_conv1d.weight") { @@ -325,7 +343,7 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) ( ssmInc += rs } e.Offload.Computation.Compute = GGUFBytesScalar(convInc + ssmInc) - } else { + default: loadAttnInc, offloadAttnInc := uint64(0), uint64(0) if o.FlashAttention { // https://github.com/ggerganov/llama.cpp/blob/172c8256840ffd882ab9992ecedbb587d9b21f15/llama.cpp#L7387. @@ -389,7 +407,10 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) ( } } // Finally, get the usage of output layer. - { + switch a.Architecture { + case "clip": + // NOP. + default: outInc := inpEmbd if a.Architecture == "mamba" { outInc += inpSMask + inpSSeq @@ -404,6 +425,11 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) ( outInc += uint64(e.Load.Weight.Output) e.Offload.Computation.Output = GGUFBytesScalar(outInc) } + + // Clipper. + if o.ClipUsage != nil { + e.Offload.Clipper = GGUFBytesScalar(*o.ClipUsage) + } } return e @@ -457,7 +483,7 @@ type ( // SummarizeMemory returns the summary of the estimated memory usage of loading the GGUF file in llama.cpp, // the input options are used to adjust the summary. -func (e LLaMACppUsageEstimate) SummarizeMemory(mmap bool, ramFootprint, vramFootprint uint64) (ems LLaMACppUsageEstimateMemorySummary) { +func (e LLaMACppUsageEstimate) SummarizeMemory(mmap bool, nonUMARamFootprint, nonUMAVramFootprint uint64) (ems LLaMACppUsageEstimateMemorySummary) { ems.OffloadLayers, ems.FullOffloaded = e.OffloadLayers, e.FullOffloaded if ems.FullOffloaded { ems.OffloadLayers++ // The output layer is offloaded. @@ -479,13 +505,14 @@ func (e LLaMACppUsageEstimate) SummarizeMemory(mmap bool, ramFootprint, vramFoot wg = e.Offload.Weight.Sum() kv = e.Offload.KVCache.Sum() cp = 0 - ems.UMA.VRAM = fp + wg + kv + cp + cl := e.Offload.Clipper + ems.UMA.VRAM = fp + wg + kv + cp + cl } // NonUMA. { // RAM. - fp := GGUFBytesScalar(ramFootprint) + e.Load.Footprint + fp := GGUFBytesScalar(nonUMARamFootprint) + e.Load.Footprint wg := e.Load.Weight.Sum() kv := e.Load.KVCache.Sum() cp := e.Load.Computation.Sum() @@ -497,11 +524,12 @@ func (e LLaMACppUsageEstimate) SummarizeMemory(mmap bool, ramFootprint, vramFoot } } // VRAM. - fp = GGUFBytesScalar(vramFootprint) + e.Offload.Footprint + fp = GGUFBytesScalar(nonUMAVramFootprint) + e.Offload.Footprint wg = e.Offload.Weight.Sum() kv = e.Offload.KVCache.Sum() cp = e.Offload.Computation.Sum() - ems.NonUMA.VRAM = fp + wg + kv + cp + cl := e.Offload.Clipper + ems.NonUMA.VRAM = fp + wg + kv + cp + cl } return ems @@ -509,10 +537,10 @@ func (e LLaMACppUsageEstimate) SummarizeMemory(mmap bool, ramFootprint, vramFoot // Summarize returns the summary of the estimated result of loading the GGUF file in llama.cpp, // the input options are used to adjust the summary. -func (e LLaMACppUsageEstimate) Summarize(mmap bool, ramFootprint, vramFootprint uint64) (es LLaMACppUsageEstimateSummary) { +func (e LLaMACppUsageEstimate) Summarize(mmap bool, nonUMARamFootprint, nonUMAVramFootprint uint64) (es LLaMACppUsageEstimateSummary) { // Summarize memory. 
es.Memory = []LLaMACppUsageEstimateMemorySummary{ - e.SummarizeMemory(mmap, ramFootprint, vramFootprint), + e.SummarizeMemory(mmap, nonUMARamFootprint, nonUMAVramFootprint), } // Just copy from the original estimate. diff --git a/file_estimate_option.go b/file_estimate_option.go index f82d09c..a70274e 100644 --- a/file_estimate_option.go +++ b/file_estimate_option.go @@ -18,6 +18,7 @@ type ( OffloadKVCache *bool OffloadLayers *uint64 FlashAttention bool + ClipUsage *uint64 } LLaMACppUsageEstimateOption func(*_LLaMACppUsageEstimateOptions) ) @@ -118,3 +119,11 @@ func WithFlashAttention() LLaMACppUsageEstimateOption { o.FlashAttention = true } } + +// WithClipUsage sets the clip usage for the estimate, +// which affects the usage of VRAM. +func WithClipUsage(clip uint64) LLaMACppUsageEstimateOption { + return func(o *_LLaMACppUsageEstimateOptions) { + o.ClipUsage = &clip + } +} diff --git a/file_from_metadata.go b/file_from_distro.go similarity index 76% rename from file_from_metadata.go rename to file_from_distro.go index edef6e9..c15f907 100644 --- a/file_from_metadata.go +++ b/file_from_distro.go @@ -11,7 +11,6 @@ import ( "time" "golang.org/x/exp/maps" - "golang.org/x/net/html" "github.com/thxcode/gguf-parser-go/util/funcx" "github.com/thxcode/gguf-parser-go/util/httpx" @@ -32,16 +31,26 @@ var ( // which will be more efficient and faster, but lossy. // If the crawling fails, it will fall back to the default behavior. func ParseGGUFFileFromOllama(ctx context.Context, model string, crawl bool, opts ...GGUFReadOption) (*GGUFFile, error) { + return ParseGGUFFileFromOllamaModel(ctx, ParseOllamaModel(model), crawl, opts...) +} + +// ParseGGUFFileFromOllamaModel is similar to ParseGGUFFileFromOllama, +// but inputs an OllamaModel instead of a string. +// +// The given OllamaModel will be completed(fetching MediaType, Config and Layers) after calling this function. +// If the crawl is true, it will try to crawl the metadata from Ollama website instead of blobs fetching, +// which will be more efficient and faster, but lossy. +// If the crawling fails, it will fall back to the default behavior. +func ParseGGUFFileFromOllamaModel(ctx context.Context, model *OllamaModel, crawl bool, opts ...GGUFReadOption) (*GGUFFile, error) { + if model == nil { + return nil, ErrOllamaInvalidModel + } + var o _GGUFReadOptions for _, opt := range opts { opt(&o) } - om := ParseOllamaModel(model) - if om == nil { - return nil, ErrOllamaInvalidModel - } - cli := httpx.Client( httpx.ClientOptions(). WithUserAgent("gguf-parser-go"). 
@@ -70,70 +79,31 @@ func ParseGGUFFileFromOllama(ctx context.Context, model string, crawl bool, opts var ml OllamaModelLayer { - err := om.Complete(ctx, cli) + err := model.Complete(ctx, cli) if err != nil { return nil, fmt.Errorf("complete ollama model: %w", err) } var ok bool - ml, ok = om.GetLayer("application/vnd.ollama.image.model") + ml, ok = model.GetLayer("application/vnd.ollama.image.model") if !ok { return nil, ErrOllamaBaseLayerNotFound } } if crawl { - mwu, lwu := om.WebURL().String(), ml.WebURL().String() - req, err := httpx.NewGetRequestWithContext(ctx, lwu) - if err != nil { - return nil, fmt.Errorf("new request: %w", err) - } - req.Header.Add("Referer", mwu) - req.Header.Add("Hx-Current-Url", mwu) - req.Header.Add("Hx-Request", "true") - req.Header.Add("Hx-Target", "file-explorer") - - var n *html.Node - err = httpx.Do(cli, req, func(resp *http.Response) error { - if resp.StatusCode != http.StatusOK { - return fmt.Errorf("status code %d", resp.StatusCode) - } - n, err = html.Parse(resp.Body) - if err != nil { - return fmt.Errorf("parse html: %w", err) - } - return nil - }) + r, err := ml.FetchWebPage(ctx, cli) if err == nil { - var wk func(*html.Node) string - wk = func(n *html.Node) string { - if n.Type == html.ElementNode && n.Data == "div" { - for i := range n.Attr { - if n.Attr[i].Key == "class" && n.Attr[i].Val == "whitespace-pre-wrap" { - return n.FirstChild.Data - } - } - } - for c := n.FirstChild; c != nil; c = c.NextSibling { - if r := wk(c); r != "" { - return r - } - } - return "" - } - - if r := wk(n); r != "" { - gf, err := parseGGUFFileFromMetadata("ollama", r, ml.Size) - if err == nil { - return gf, nil - } + gf, err := parseGGUFFileFromDistroMetadata("ollama", r, ml.Size) + if err == nil { + return gf, nil } } // Fallback to the default behavior. 
} - return parseGGUFFileFromRemote(ctx, cli, ml.URL().String(), o) + return parseGGUFFileFromRemote(ctx, cli, ml.BlobURL().String(), o) } type _OllamaMetadata struct { @@ -148,7 +118,7 @@ type _OllamaMetadata struct { Version uint32 `json:"version"` } -func parseGGUFFileFromMetadata(source, data string, size uint64) (*GGUFFile, error) { +func parseGGUFFileFromDistroMetadata(source, data string, size uint64) (*GGUFFile, error) { if source != "ollama" { return nil, fmt.Errorf("invalid source %q", source) } @@ -174,7 +144,7 @@ func parseGGUFFileFromMetadata(source, data string, size uint64) (*GGUFFile, err gf.Header.Magic = GGUFMagicGGUFLe gf.Header.Version = GGUFVersion(m.Version) gf.Header.TensorCount = uint64(len(m.Tensors)) - gf.Header.MetadataKVCount = uint64(len(m.Metadata) + 1 /* tokenizer.chat_template */) + gf.Header.MetadataKVCount = uint64(1 /* tokenizer.chat_template */ + len(m.Metadata)) gf.Size = GGUFBytesScalar(size) gf.ModelParameters = GGUFParametersScalar(m.NumParams) @@ -223,6 +193,24 @@ func parseGGUFFileFromMetadata(source, data string, size uint64) (*GGUFFile, err } v = av } + case []any: + vt = GGUFMetadataValueTypeArray + av := GGUFMetadataKVArrayValue{ + Type: GGUFMetadataValueTypeString, + Len: uint64(len(vv)), + } + if av.Len > 0 { + av.Array = vv + switch vv[0].(type) { + case bool: + av.Type = GGUFMetadataValueTypeBool + case float64: + av.Type = GGUFMetadataValueTypeFloat32 + case int64: + av.Type = GGUFMetadataValueTypeUint32 + } + } + v = av } gf.Header.MetadataKV = append(gf.Header.MetadataKV, GGUFMetadataKV{ Key: k, diff --git a/file_from_remote.go b/file_from_remote.go index 75d4226..fed6c06 100644 --- a/file_from_remote.go +++ b/file_from_remote.go @@ -17,7 +17,7 @@ func ParseGGUFFileFromHuggingFace(ctx context.Context, repo, file string, opts . return ParseGGUFFileRemote(ctx, fmt.Sprintf("https://huggingface.co/%s/resolve/main/%s", repo, file), opts...) } -// ParseGGUFFileRemote parses a GGUF file from a remote URL, +// ParseGGUFFileRemote parses a GGUF file from the given remote URL, // and returns a GGUFFile, or an error if any.
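For the Ollama path, the `crawl` flag chooses between scraping the registry web page and downloading the GGUF blob itself. A short sketch under the same assumptions (the import alias and model reference are illustrative; the `"llama"` architecture fallback comes from the `Model()` change in the next hunk):

```go
package main

import (
	"context"
	"fmt"
	"log"

	ggufparser "github.com/thxcode/gguf-parser-go"
)

func main() {
	ctx := context.Background()

	// crawl=true scrapes the Ollama website first (faster, but lossy);
	// on any failure it falls back to fetching the GGUF blob.
	gf, err := ggufparser.ParseGGUFFileFromOllama(ctx, "llama3", true)
	if err != nil {
		log.Fatal(err)
	}

	m := gf.Model()
	fmt.Println(m.Architecture, gf.Size, gf.ModelParameters)
}
```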
func ParseGGUFFileRemote(ctx context.Context, url string, opts ...GGUFReadOption) (*GGUFFile, error) { var o _GGUFReadOptions diff --git a/file_model.go b/file_model.go index 8d90382..485d4a6 100644 --- a/file_model.go +++ b/file_model.go @@ -132,6 +132,8 @@ func (gf *GGUFFile) Model() (gm GGUFModelMetadata) { if v, ok := m[architectureKey]; ok { gm.Architecture = v.ValueString() + } else { + gm.Architecture = "llama" } if v, ok := m[quantizationKey]; ok { gm.QuantizationVersion = ValueNumeric[uint32](v) diff --git a/go.mod b/go.mod index 78047d8..3a6ade9 100644 --- a/go.mod +++ b/go.mod @@ -11,6 +11,7 @@ require ( github.com/stretchr/testify v1.9.0 golang.org/x/exp v0.0.0-20240506185415-9bf2ced13842 golang.org/x/net v0.25.0 + golang.org/x/sync v0.7.0 golang.org/x/sys v0.20.0 golang.org/x/tools v0.21.0 ) @@ -20,6 +21,5 @@ require ( github.com/modern-go/reflect2 v1.0.2 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect golang.org/x/mod v0.17.0 // indirect - golang.org/x/sync v0.7.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/ollama_model.go b/ollama_model.go index 8475b9e..dfd25ae 100644 --- a/ollama_model.go +++ b/ollama_model.go @@ -8,6 +8,9 @@ import ( "regexp" "strings" + "golang.org/x/net/html" + "golang.org/x/sync/errgroup" + "github.com/thxcode/gguf-parser-go/util/httpx" "github.com/thxcode/gguf-parser-go/util/json" "github.com/thxcode/gguf-parser-go/util/stringx" @@ -24,6 +27,8 @@ const ( ) type ( + // OllamaModel represents an Ollama model; + // its manifest (including MediaType, Config and Layers) can be completed by calling the Complete method. OllamaModel struct { Schema string `json:"schema"` Registry string `json:"registry"` @@ -34,13 +39,32 @@ type ( MediaType string `json:"mediaType"` Config OllamaModelLayer `json:"config"` Layers []OllamaModelLayer `json:"layers"` + + // Client is the http client used for the OllamaModel's network operations. + // + // When this field is nil, + // it will be set to the client used by OllamaModel.Complete. + // + // When this field is provided, + // the network operations will be done with this client. + Client *http.Client `json:"-"` } + + // OllamaModelLayer represents an Ollama model layer; + // its digest can be used to download the artifact. OllamaModelLayer struct { MediaType string `json:"mediaType"` Size uint64 `json:"size"` Digest string `json:"digest"` - model *OllamaModel + // Root points to the root OllamaModel, + // which is never serialized or deserialized. + // + // When OllamaModel.Complete is called, + // this field will be set to that OllamaModel. + // Otherwise, this field will be nil, + // and must be set manually to the root OllamaModel before calling any method of OllamaModelLayer. + Root *OllamaModel `json:"-"` } ) @@ -142,73 +166,324 @@ func (om *OllamaModel) SearchLayers(mediaTypeRegex *regexp.Regexp) []OllamaModel return ls } -// URL returns the URL of the OllamaModel. -func (om *OllamaModel) URL() *url.URL { +// WebPageURL returns the Ollama web page URL of the OllamaModel. +func (om *OllamaModel) WebPageURL() *url.URL { u := &url.URL{ Scheme: om.Schema, Host: om.Registry, } - return u.JoinPath("v2", om.Namespace, om.Repository, "manifests", om.Tag) + return u.JoinPath(om.Namespace, om.Repository+":"+om.Tag) } -// WebURL returns the Ollama web URL of the OllamaModel. -func (om *OllamaModel) WebURL() *url.URL { +// Complete completes the OllamaModel with the given context and http client.
+func (om *OllamaModel) Complete(ctx context.Context, cli *http.Client) error { + if om.Client == nil { + om.Client = cli + } + u := &url.URL{ Scheme: om.Schema, Host: om.Registry, } - return u.JoinPath(om.Namespace, om.Repository+":"+om.Tag) -} + u = u.JoinPath("v2", om.Namespace, om.Repository, "manifests", om.Tag) -// Complete completes the OllamaModel with the given context and http client. -func (om *OllamaModel) Complete(ctx context.Context, cli *http.Client) error { - req, err := httpx.NewGetRequestWithContext(ctx, om.URL().String()) + req, err := httpx.NewGetRequestWithContext(ctx, u.String()) if err != nil { return fmt.Errorf("new request: %w", err) } - err = httpx.Do(cli, req, func(resp *http.Response) error { + err = httpx.Do(om.Client, req, func(resp *http.Response) error { if resp.StatusCode != http.StatusOK { return fmt.Errorf("status code %d", resp.StatusCode) } return json.NewDecoder(resp.Body).Decode(om) }) if err != nil { - return fmt.Errorf("do request: %w", err) + return fmt.Errorf("do request %s: %w", u, err) } // Connect. - om.Config.model = om + om.Config.Root = om for i := range om.Layers { - om.Layers[i].model = om + om.Layers[i].Root = om } return nil } -// URL returns the URL of the OllamaModelLayer. -func (ol *OllamaModelLayer) URL() *url.URL { - if ol.model == nil { +// Params returns the parameters of the OllamaModel. +func (om *OllamaModel) Params(ctx context.Context, cli *http.Client) (map[string]any, error) { + if cli == nil { + cli = om.Client + } + if cli == nil { + return nil, fmt.Errorf("no client") + } + + mls := om.SearchLayers(regexp.MustCompile(`^application/vnd\.ollama\.image\.params$`)) + if len(mls) == 0 { + return nil, nil + } + + rs := make([]map[string]any, len(mls)) + eg, ctx := errgroup.WithContext(ctx) + for i := range mls { + x := i + eg.Go(func() error { + bs, err := mls[x].FetchBlob(ctx, cli) + if err == nil { + p := make(map[string]any) + if err = json.Unmarshal(bs, &p); err == nil { + rs[x] = p + } + } + return err + }) + } + if err := eg.Wait(); err != nil { + return nil, fmt.Errorf("fetch blob: %w", err) + } + + r := make(map[string]any) + for i := range rs { + for k, v := range rs[i] { + r[k] = v + } + } + return r, nil +} + +// Template returns the template of the OllamaModel. +func (om *OllamaModel) Template(ctx context.Context, cli *http.Client) (string, error) { + if cli == nil { + cli = om.Client + } + if cli == nil { + return "", fmt.Errorf("no client") + } + + mls := om.SearchLayers(regexp.MustCompile(`^application/vnd\.ollama\.image\.(prompt|template)$`)) + if len(mls) == 0 { + return "", nil + } + + ml := mls[len(mls)-1] + bs, err := ml.FetchBlob(ctx, cli) + if err != nil { + return "", fmt.Errorf("fetch blob: %w", err) + } + return stringx.FromBytes(&bs), nil +} + +// System returns the system message of the OllamaModel. +func (om *OllamaModel) System(ctx context.Context, cli *http.Client) (string, error) { + if cli == nil { + cli = om.Client + } + if cli == nil { + return "", fmt.Errorf("no client") + } + + mls := om.SearchLayers(regexp.MustCompile(`^application/vnd\.ollama\.image\.system$`)) + if len(mls) == 0 { + return "", nil + } + + ml := mls[len(mls)-1] + bs, err := ml.FetchBlob(ctx, cli) + if err != nil { + return "", fmt.Errorf("fetch blob: %w", err) + } + return stringx.FromBytes(&bs), nil +} + +// License returns the license of the OllamaModel. 
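These accessors are what the Ollama Modelfile usage estimate builds on: complete the manifest once, then pull the params, template, and system prompt from their layers. A minimal sketch, assuming a short model reference resolves against the default registry; `http.DefaultClient` stands in for the package's own httpx client:

```go
package main

import (
	"context"
	"fmt"
	"log"
	"net/http"

	ggufparser "github.com/thxcode/gguf-parser-go"
)

func main() {
	ctx := context.Background()

	om := ggufparser.ParseOllamaModel("gemma2:2b") // illustrative reference
	if om == nil {
		log.Fatal("invalid model reference")
	}

	// Complete fetches the manifest and wires Root/Client into every layer,
	// so the accessors below may pass a nil client and reuse om.Client.
	if err := om.Complete(ctx, http.DefaultClient); err != nil {
		log.Fatal(err)
	}

	params, err := om.Params(ctx, nil)
	if err != nil {
		log.Fatal(err)
	}
	tmpl, _ := om.Template(ctx, nil)
	sys, _ := om.System(ctx, nil)

	fmt.Println("params:", params)
	fmt.Println("template:", tmpl)
	fmt.Println("system:", sys)
}
```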
+func (om *OllamaModel) License(ctx context.Context, cli *http.Client) ([]string, error) { + if cli == nil { + cli = om.Client + } + if cli == nil { + return nil, fmt.Errorf("no client") + } + + mls := om.SearchLayers(regexp.MustCompile(`^application/vnd\.ollama\.image\.license$`)) + if len(mls) == 0 { + return nil, nil + } + + rs := make([]string, len(mls)) + eg, ctx := errgroup.WithContext(ctx) + for i := range mls { + x := i + eg.Go(func() error { + bs, err := mls[x].FetchBlob(ctx, cli) + if err == nil { + rs[x] = stringx.FromBytes(&bs) + } + return err + }) + } + if err := eg.Wait(); err != nil { + return nil, fmt.Errorf("fetch blob: %w", err) + } + return rs, nil +} + +// Messages returns the messages of the OllamaModel. +func (om *OllamaModel) Messages(ctx context.Context, cli *http.Client) ([]json.RawMessage, error) { + if cli == nil { + cli = om.Client + } + if cli == nil { + return nil, fmt.Errorf("no client") + } + + mls := om.SearchLayers(regexp.MustCompile(`^application/vnd\.ollama\.image\.messages$`)) + if len(mls) == 0 { + return nil, nil + } + + rs := make([]json.RawMessage, len(mls)) + eg, ctx := errgroup.WithContext(ctx) + for i := range mls { + x := i + eg.Go(func() error { + bs, err := mls[x].FetchBlob(ctx, cli) + if err == nil { + rs[x] = bs + } + return err + }) + } + if err := eg.Wait(); err != nil { + return nil, fmt.Errorf("fetch blob: %w", err) + } + return rs, nil +} + +// BlobURL returns the blob URL of the OllamaModelLayer. +func (ol *OllamaModelLayer) BlobURL() *url.URL { + if ol.Root == nil { return nil } u := &url.URL{ - Scheme: ol.model.Schema, - Host: ol.model.Registry, + Scheme: ol.Root.Schema, + Host: ol.Root.Registry, + } + return u.JoinPath("v2", ol.Root.Namespace, ol.Root.Repository, "blobs", ol.Digest) +} + +// FetchBlob fetches the blob of the OllamaModelLayer with the given context and http client, +// and returns the response body as bytes. +func (ol *OllamaModelLayer) FetchBlob(ctx context.Context, cli *http.Client) ([]byte, error) { + var b []byte + err := ol.FetchBlobFunc(ctx, cli, func(resp *http.Response) error { + b = httpx.BodyBytes(resp) + return nil + }) + return b, err +} + +// FetchBlobFunc fetches the blob of the OllamaModelLayer with the given context and http client, +// and processes the response with the given function. +func (ol *OllamaModelLayer) FetchBlobFunc(ctx context.Context, cli *http.Client, process func(*http.Response) error) error { + if cli == nil { + cli = ol.Root.Client + } + if cli == nil { + return fmt.Errorf("no client") + } + + u := ol.BlobURL() + if u == nil { + return fmt.Errorf("no blob URL") + } + + req, err := httpx.NewGetRequestWithContext(ctx, u.String()) + if err != nil { + return fmt.Errorf("new request: %w", err) + } + + err = httpx.Do(cli, req, process) + if err != nil { + return fmt.Errorf("do request %s: %w", u, err) } - return u.JoinPath("v2", ol.model.Namespace, ol.model.Repository, "blobs", ol.Digest) + return nil } -// WebURL returns the Ollama web URL of the OllamaModelLayer. -func (ol *OllamaModelLayer) WebURL() *url.URL { - if ol.model == nil || len(ol.MediaType) < 12 { +// WebPageURL returns the Ollama web page URL of the OllamaModelLayer. 
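`BlobURL` and `FetchBlob`/`FetchBlobFunc` replace the old `URL()` helper and are what the non-crawl path of `ParseGGUFFileFromOllamaModel` ultimately uses. A sketch of direct layer access, continuing the assumptions above (illustrative model reference, `http.DefaultClient`):

```go
package main

import (
	"context"
	"fmt"
	"log"
	"net/http"

	ggufparser "github.com/thxcode/gguf-parser-go"
)

func main() {
	ctx := context.Background()

	om := ggufparser.ParseOllamaModel("qwen2:0.5b") // illustrative reference
	if om == nil {
		log.Fatal("invalid model reference")
	}
	if err := om.Complete(ctx, http.DefaultClient); err != nil {
		log.Fatal(err)
	}

	// The GGUF weights live in the "model" layer; its digest-addressed
	// blob URL is what the non-crawl path downloads.
	if ml, ok := om.GetLayer("application/vnd.ollama.image.model"); ok {
		fmt.Println("weights:", ml.BlobURL(), ml.Size, "bytes")
	}

	// Small layers (params, template, system, ...) can be pulled whole.
	if pl, ok := om.GetLayer("application/vnd.ollama.image.params"); ok {
		bs, err := pl.FetchBlob(ctx, nil) // nil reuses om.Client set by Complete
		if err != nil {
			log.Fatal(err)
		}
		fmt.Printf("params: %s\n", bs)
	}
}
```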
+func (ol *OllamaModelLayer) WebPageURL() *url.URL { + if ol.Root == nil || len(ol.MediaType) < 12 { return nil } dg := strings.TrimPrefix(ol.Digest, "sha256:")[:12] u := &url.URL{ - Scheme: ol.model.Schema, - Host: ol.model.Registry, + Scheme: ol.Root.Schema, + Host: ol.Root.Registry, + } + return u.JoinPath(ol.Root.Namespace, ol.Root.Repository+":"+ol.Root.Tag, "blobs", dg) +} + +// FetchWebPage fetches the web page of the OllamaModelLayer with the given context and http client, +// and returns the text content of the page's "whitespace-pre-wrap" element. +func (ol *OllamaModelLayer) FetchWebPage(ctx context.Context, cli *http.Client) (string, error) { + if cli == nil { + cli = ol.Root.Client + } + if cli == nil { + return "", fmt.Errorf("no client") + } + + u := ol.WebPageURL() + if u == nil { + return "", fmt.Errorf("no web page URL") + } + + req, err := httpx.NewGetRequestWithContext(ctx, u.String()) + if err != nil { + return "", fmt.Errorf("new request: %w", err) + } + { + rus := ol.Root.WebPageURL().String() + req.Header.Add("Referer", rus) + req.Header.Add("Hx-Current-Url", rus) + req.Header.Add("Hx-Request", "true") + req.Header.Add("Hx-Target", "file-explorer") + } + + var n *html.Node + err = httpx.Do(cli, req, func(resp *http.Response) error { + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("status code %d", resp.StatusCode) + } + n, err = html.Parse(resp.Body) + if err != nil { + return fmt.Errorf("parse html: %w", err) + } + return nil + }) + if err != nil { + return "", fmt.Errorf("do request %s: %w", u, err) + } + + var wk func(*html.Node) string + wk = func(n *html.Node) string { + if n.Type == html.ElementNode && n.Data == "div" { + for i := range n.Attr { + if n.Attr[i].Key == "class" && n.Attr[i].Val == "whitespace-pre-wrap" { + return n.FirstChild.Data + } + } + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + if r := wk(c); r != "" { + return r + } + } + return "" } - return u.JoinPath(ol.model.Namespace, ol.model.Repository+":"+ol.model.Tag, "blobs", dg) + + return wk(n), nil } diff --git a/util/anyx/any.go b/util/anyx/any.go new file mode 100644 index 0000000..8c74fa1 --- /dev/null +++ b/util/anyx/any.go @@ -0,0 +1,128 @@ +package anyx + +import ( + "encoding/json" + "fmt" + "strconv" + + "golang.org/x/exp/constraints" +) + +// Number converts any type to the specified number type. +func Number[T constraints.Integer | constraints.Float](v any) T { + switch vv := v.(type) { + case int: + return T(vv) + case int8: + return T(vv) + case int16: + return T(vv) + case int32: + return T(vv) + case int64: + return T(vv) + case uint: + return T(vv) + case uint8: + return T(vv) + case uint16: + return T(vv) + case uint32: + return T(vv) + case uint64: + return T(vv) + case float32: + return T(vv) + case float64: + return T(vv) + case bool: + if vv { + return T(1) + } + return T(0) + case string: + x, err := strconv.ParseInt(vv, 10, 64) + if err != nil { + y, err := strconv.ParseFloat(vv, 64) + if err != nil { + return T(0) + } else { + return T(y) + } + } + return T(x) + case json.Number: + x, err := vv.Int64() + if err != nil { + y, err := vv.Float64() + if err != nil { + return T(0) + } else { + return T(y) + } + } + return T(x) + default: + return T(0) + } +} + +// Bool converts any type to a bool.
+func Bool(v any) bool { + switch vv := v.(type) { + case bool: + return vv + case uintptr: + return vv != 0 + case int, int8, int16, int32, int64, uint, uint8, uint16, uint32, uint64, float32, float64: + return Number[float64](vv) != 0 // vv stays typed as any in a grouped case, so compare via Number instead of against a plain int 0. + case string: + return vv != "0" + case fmt.Stringer: + return vv.String() != "0" + default: + return false + } +} + +// String converts any type to a string. +func String(v any) string { + switch vv := v.(type) { + case string: + return vv + case []byte: + return string(vv) + case int: + return strconv.FormatInt(int64(vv), 10) + case int8: + return strconv.FormatInt(int64(vv), 10) + case int16: + return strconv.FormatInt(int64(vv), 10) + case int32: + return strconv.FormatInt(int64(vv), 10) + case int64: + return strconv.FormatInt(vv, 10) + case uint: + return strconv.FormatUint(uint64(vv), 10) + case uint8: + return strconv.FormatUint(uint64(vv), 10) + case uint16: + return strconv.FormatUint(uint64(vv), 10) + case uint32: + return strconv.FormatUint(uint64(vv), 10) + case uint64: + return strconv.FormatUint(vv, 10) + case float32: + return strconv.FormatFloat(float64(vv), 'f', -1, 32) + case float64: + return strconv.FormatFloat(vv, 'f', -1, 64) + case bool: + return strconv.FormatBool(vv) + case fmt.Stringer: + return vv.String() + case json.RawMessage: + return string(vv) + default: + return fmt.Sprintf("%v", v) + } +} diff --git a/util/httpx/client.go b/util/httpx/client.go index 5d0c486..d925e05 100644 --- a/util/httpx/client.go +++ b/util/httpx/client.go @@ -246,5 +246,8 @@ func Do(cli *http.Client, req *http.Request, respFunc func(*http.Response) error return fmt.Errorf("do request: %w", err) } defer Close(resp) + if respFunc == nil { + return nil + } return respFunc(resp) } diff --git a/util/json/common.go b/util/json/common.go index ec77692..57a5406 100644 --- a/util/json/common.go +++ b/util/json/common.go @@ -10,6 +10,8 @@ type RawMessage = stdjson.RawMessage var ( MarshalIndent = stdjson.MarshalIndent Indent = stdjson.Indent + NewEncoder = stdjson.NewEncoder + Valid = stdjson.Valid ) // MustMarshal is similar to Marshal, diff --git a/util/json/jsoniter.go b/util/json/jsoniter.go index 6cd66c1..edb2af6 100644 --- a/util/json/jsoniter.go +++ b/util/json/jsoniter.go @@ -37,12 +37,11 @@ func init() { } } jsoniter.RegisterTypeDecoderFunc("interface {}", decodeNumberAsInt64IfPossible) + jsoniter.RegisterTypeDecoderFunc("any", decodeNumberAsInt64IfPossible) } var ( Marshal = json.Marshal Unmarshal = json.Unmarshal NewDecoder = json.NewDecoder - NewEncoder = json.NewEncoder - Valid = json.Valid ) diff --git a/util/json/stdjson.go b/util/json/stdjson.go index 602394e..d04966e 100644 --- a/util/json/stdjson.go +++ b/util/json/stdjson.go @@ -10,6 +10,4 @@ var ( Marshal = json.Marshal Unmarshal = json.Unmarshal NewDecoder = json.NewDecoder - NewEncoder = json.NewEncoder - Valid = json.Valid ) diff --git a/util/stringx/bytes.go b/util/stringx/bytes.go new file mode 100644 index 0000000..7433a2a --- /dev/null +++ b/util/stringx/bytes.go @@ -0,0 +1,14 @@ +package stringx + +import "unsafe" + +// FromBytes converts a byte slice to a string. +func FromBytes(b *[]byte) string { + return unsafe.String(unsafe.SliceData(*b), len(*b)) +} + +// ToBytes converts a string to a byte slice; +// the returned slice must not be modified. +func ToBytes(s string) (bs []byte) { + return unsafe.Slice(unsafe.StringData(s), len(s)) +}
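The new util packages carry the Modelfile parameter handling: anyx normalizes the loosely typed values decoded from the params layer, and stringx converts fetched blobs to strings without copying. A quick sketch of both; the expected results in the comments follow from the code above:

```go
package main

import (
	"fmt"

	"github.com/thxcode/gguf-parser-go/util/anyx"
	"github.com/thxcode/gguf-parser-go/util/stringx"
)

func main() {
	// anyx.Number coerces JSON-ish values into a concrete numeric type.
	fmt.Println(anyx.Number[uint64]("4096")) // 4096
	fmt.Println(anyx.Number[float64]("0.7")) // 0.7
	fmt.Println(anyx.Bool(1), anyx.Bool("0")) // true false
	fmt.Println(anyx.String(3.14), anyx.String(true)) // 3.14 true

	// stringx converts between string and []byte without copying;
	// the slice returned by ToBytes must not be modified.
	b := []byte("hello")
	s := stringx.FromBytes(&b)
	fmt.Println(s, len(stringx.ToBytes(s))) // hello 5
}
```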