Skip to content

Commit

Permalink
feat: parse ollama model
Browse files Browse the repository at this point in the history
Signed-off-by: thxCode <[email protected]>
  • Loading branch information
thxCode committed Jul 4, 2024
1 parent b541748 commit c293abd
Show file tree
Hide file tree
Showing 21 changed files with 1,166 additions and 107 deletions.
61 changes: 61 additions & 0 deletions cmd/gguf-parser/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ Usage of gguf-parser ...:
Specify how many layers to offload, which is used to estimate the usage, default is full offloaded. [Deprecated, use --gpu-layers instead] (default -1)
-offload-layers-step uint
Specify the step of layers to offload, works with --offload-layers. [Deprecated, use --gpu-layers-step instead]
-ol-crawl
Crawl the Ollama model metadata instead of fetching blobs, which is more efficient and faster, but lossy.
-ol-model string
Model name of Ollama, e.g. gemma2.
-parallel-size int
Specify the number of parallel sequences to decode, which is used to estimate the usage, default is 1. (default 1)
-path string
Expand All @@ -61,6 +65,7 @@ Usage of gguf-parser ...:
URL from which to load the GGUF file, e.g. https://huggingface.co/NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF/resolve/main/Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf. Note that gguf-parser does not need to download the entire GGUF file.
-version
Show gguf-parser version.
```
### Parse
Expand Down Expand Up @@ -155,6 +160,62 @@ $ gguf-parser --hf-repo="openbmb/MiniCPM-Llama3-V-2_5-gguf" --hf-file="ggml-mode
```
#### Parse Ollama model
```shell
$ gguf-parser --ol-model="gemma2"
+-------+--------+--------+----------------------+-----------+---------------+----------+------------+----------+
| MODEL | NAME | ARCH | QUANTIZATION VERSION | FILE TYPE | LITTLE ENDIAN | SIZE | PARAMETERS | BPW |
+ +--------+--------+----------------------+-----------+---------------+----------+------------+----------+
| | gemma2 | gemma2 | 2 | Q4_0 | true | 5.06 GiB | 9.24 B | 4.71 bpw |
+-------+--------+--------+----------------------+-----------+---------------+----------+------------+----------+
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
| ARCHITECTURE | MAX CONTEXT LEN | EMBEDDING LEN | EMBEDDING GQA | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN |
+ +-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
| | 8192 | 3584 | 2048 | 16 | 42 | 14336 | 0 | 256000 |
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
+-----------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
| TOKENIZER | MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN |
+ +-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
| | llama | 3.80 MiB | 256000 | 0 | 2 | 1 | 3 | N/A | 0 |
+-----------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
+----------+--------+--------------+-----------------+--------------+----------------+----------------+----------+------------+-------------+
| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | FULL OFFLOADED | UMA RAM | NONUMA RAM | NONUMA VRAM |
+ +--------+--------------+-----------------+--------------+----------------+----------------+----------+------------+-------------+
| | gemma2 | 8192 | false | true | 43 (42 + 1) | Yes | 2.69 GiB | 215.97 MiB | 8.43 GiB |
+----------+--------+--------------+-----------------+--------------+----------------+----------------+----------+------------+-------------+
$ gguf-parser --ol-model="gemma2" --ol-crawl
+-------+--------+--------+----------------------+-----------+---------------+----------+------------+----------+
| MODEL | NAME | ARCH | QUANTIZATION VERSION | FILE TYPE | LITTLE ENDIAN | SIZE | PARAMETERS | BPW |
+ +--------+--------+----------------------+-----------+---------------+----------+------------+----------+
| | gemma2 | gemma2 | 2 | Q4_0 | true | 5.06 GiB | 9.24 B | 4.71 bpw |
+-------+--------+--------+----------------------+-----------+---------------+----------+------------+----------+
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
| ARCHITECTURE | MAX CONTEXT LEN | EMBEDDING LEN | EMBEDDING GQA | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN |
+ +-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
| | 8192 | 3584 | 2048 | 16 | 42 | 14336 | 0 | 256000 |
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
+-----------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
| TOKENIZER | MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN |
+ +-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
| | llama | 0 B | 256000 | 0 | 2 | 1 | 3 | N/A | 0 |
+-----------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
+----------+--------+--------------+-----------------+--------------+----------------+----------------+----------+------------+-------------+
| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | FULL OFFLOADED | UMA RAM | NONUMA RAM | NONUMA VRAM |
+ +--------+--------------+-----------------+--------------+----------------+----------------+----------+------------+-------------+
| | gemma2 | 8192 | false | true | 43 (42 + 1) | Yes | 2.69 GiB | 215.99 MiB | 8.12 GiB |
+----------+--------+--------------+-----------------+--------------+----------------+----------------+----------+------------+-------------+
```
### Estimate
#### Estimate with zero layers offload
Expand Down
5 changes: 5 additions & 0 deletions cmd/gguf-parser/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,15 @@ require (

require (
github.com/henvic/httpretty v0.1.3 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/mattn/go-runewidth v0.0.9 // indirect
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421 // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/rs/dnscache v0.0.0-20230804202142-fc85eb664529 // indirect
github.com/smallnest/ringbuffer v0.0.0-20240423223918-bab516b2000b // indirect
golang.org/x/exp v0.0.0-20240506185415-9bf2ced13842 // indirect
golang.org/x/mod v0.17.0 // indirect
golang.org/x/net v0.25.0 // indirect
golang.org/x/sync v0.7.0 // indirect
golang.org/x/sys v0.20.0 // indirect
golang.org/x/tools v0.21.0 // indirect
Expand Down
15 changes: 15 additions & 0 deletions cmd/gguf-parser/go.sum

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

44 changes: 27 additions & 17 deletions cmd/gguf-parser/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@ import (
"strconv"
"strings"
"sync"
stdjson "encoding/json"

"github.com/olekukonko/tablewriter"

"github.com/thxcode/gguf-parser-go/util/json"

. "github.com/thxcode/gguf-parser-go"
)

Expand All @@ -24,9 +25,12 @@ func main() {

var (
// model options
path string
url string
repo, file string
path string
url string
hfRepo string
hfFile string
olModel string
olCrawl bool
// read options
debug bool
skipTLSVerify bool
Expand All @@ -47,8 +51,8 @@ func main() {
skipTokenizer bool
skipEstimate bool
inMib bool
json bool
jsonPretty = true
inJson bool
inPrettyJson = true
)
fs := flag.NewFlagSet(os.Args[0], flag.ExitOnError)
fs.Usage = func() {
Expand All @@ -62,14 +66,18 @@ func main() {
"https://huggingface.co/NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF"+
"/resolve/main/Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf. "+
"Note that gguf-parser does not need to download the entire GGUF file.")
fs.StringVar(&repo, "repo", repo, "Repository of HuggingFace which the GGUF file store, e.g. "+
fs.StringVar(&hfRepo, "repo", hfRepo, "Repository of HuggingFace which the GGUF file store, e.g. "+
"NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF, works with --file. [Deprecated, use --hf-repo instead]")
fs.StringVar(&file, "file", file, "Model file below the --repo, e.g. "+
fs.StringVar(&hfFile, "file", hfFile, "Model file below the --repo, e.g. "+
"Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf. [Deprecated, use --hf-file instead]") // Deprecated.
fs.StringVar(&repo, "hf-repo", repo, "Repository of HuggingFace which the GGUF file store, e.g. "+
fs.StringVar(&hfRepo, "hf-repo", hfRepo, "Repository of HuggingFace which the GGUF file store, e.g. "+
"NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF, works with --hf-file.") // Deprecated.
fs.StringVar(&file, "hf-file", file, "Model file below the --repo, e.g. "+
fs.StringVar(&hfFile, "hf-file", hfFile, "Model file below the --repo, e.g. "+
"Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf.")
fs.StringVar(&olModel, "ol-model", olModel, "Model name of Ollama, e.g. "+
"gemma2.")
fs.BoolVar(&olCrawl, "ol-crawl", olCrawl, "Crawl the Ollama model instead of blobs fetching, "+
"which will be more efficient and faster, but lossy.")
fs.BoolVar(&debug, "debug", debug, "Enable debugging, verbosity.")
fs.BoolVar(&skipTLSVerify, "skip-tls-verify", skipTLSVerify, "Skip TLS verification, works with --url.")
fs.IntVar(&ctxSize, "ctx-size", ctxSize, "Specify the size of prompt context, "+
Expand Down Expand Up @@ -113,8 +121,8 @@ func main() {
fs.BoolVar(&skipTokenizer, "skip-tokenizer", skipTokenizer, "Skip to display tokenizer metadata")
fs.BoolVar(&skipEstimate, "skip-estimate", skipEstimate, "Skip to estimate.")
fs.BoolVar(&inMib, "in-mib", inMib, "Display the estimated result in table with MiB.")
fs.BoolVar(&json, "json", json, "Output as JSON.")
fs.BoolVar(&jsonPretty, "json-pretty", jsonPretty, "Output as pretty JSON.")
fs.BoolVar(&inJson, "json", inJson, "Output as JSON.")
fs.BoolVar(&inPrettyJson, "json-pretty", inPrettyJson, "Output as pretty JSON.")
if err := fs.Parse(os.Args[1:]); err != nil {
fmt.Println(err.Error())
os.Exit(1)
Expand Down Expand Up @@ -192,8 +200,10 @@ func main() {
gf, err = ParseGGUFFile(path, ropts...)
case url != "":
gf, err = ParseGGUFFileRemote(ctx, url, ropts...)
case repo != "" && file != "":
gf, err = ParseGGUFFileFromHuggingFace(ctx, repo, file, ropts...)
case hfRepo != "" && hfFile != "":
gf, err = ParseGGUFFileFromHuggingFace(ctx, hfRepo, hfFile, ropts...)
case olModel != "":
gf, err = ParseGGUFFileFromOllama(ctx, olModel, olCrawl, ropts...)
}
if err != nil {
_, _ = fmt.Fprintf(os.Stderr, "failed to parse GGUF file: %s\n", err.Error())
Expand Down Expand Up @@ -244,7 +254,7 @@ func main() {
}
}

if json {
if inJson {
o := map[string]any{}
if !skipModel {
o["model"] = m
Expand Down Expand Up @@ -286,8 +296,8 @@ func main() {
o["estimate"] = es
}

enc := stdjson.NewEncoder(os.Stdout)
if jsonPretty {
enc := json.NewEncoder(os.Stdout)
if inPrettyJson {
enc.SetIndent("", " ")
}
if err := enc.Encode(o); err != nil {
Expand Down
80 changes: 11 additions & 69 deletions file.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,18 @@ package gguf_parser

import (
"bytes"
"context"
"encoding/binary"
"errors"
"fmt"
"io"
"net/http"
"regexp"
"strconv"
"strings"
"time"

"golang.org/x/exp/constraints"

"github.com/thxcode/gguf-parser-go/util/bytex"
"github.com/thxcode/gguf-parser-go/util/funcx"
"github.com/thxcode/gguf-parser-go/util/httpx"
"github.com/thxcode/gguf-parser-go/util/osx"
)

Expand All @@ -36,10 +32,14 @@ type GGUFFile struct {
TensorInfos GGUFTensorInfos `json:"tensorInfos"`
// Padding is the padding size of the GGUF file,
// which is used to split Header and TensorInfos from tensor data.
//
// This might be empty if parsed from a crawler.
Padding int64 `json:"padding"`
// TensorDataStartOffset is the offset in bytes of the tensor data in this file.
//
// The offset is the start of the file.
//
// This might be lossy if parsed from a crawler.
TensorDataStartOffset int64 `json:"tensorDataStartOffset"`

/* Appendix */
Expand Down Expand Up @@ -151,17 +151,21 @@ type (
Len uint64 `json:"len"`
// Array holds all array items.
//
// Array may be empty if skipping.
// This might be empty if skipped or parsed from a crawler.
Array []any `json:"array,omitempty"`

/* Appendix */

// StartOffset is the offset in bytes of the GGUFMetadataKVArrayValue in the GGUFFile file.
//
// The offset is the start of the file.
//
// This might be empty if parsed from a crawler.
StartOffset int64 `json:"startOffset"`

// Size is the size of the array in bytes.
//
// This might be empty if parsed from a crawler.
Size int64 `json:"endOffset"`
}

Expand Down Expand Up @@ -195,6 +199,8 @@ type (
// StartOffset is the offset in bytes of the GGUFTensorInfo in the GGUFFile file.
//
// The offset is the start of the file.
//
// This might be empty if parsed from a crawler.
StartOffset int64 `json:"startOffset"`
}

Expand Down Expand Up @@ -237,70 +243,6 @@ func ParseGGUFFile(path string, opts ...GGUFReadOption) (*GGUFFile, error) {
return parseGGUFFile(s, f, o)
}

// ParseGGUFFileRemote parses a GGUF file from a remote URL,
// and returns a GGUFFile, or an error if any.
func ParseGGUFFileRemote(ctx context.Context, url string, opts ...GGUFReadOption) (*GGUFFile, error) {
var o _GGUFReadOptions
for _, opt := range opts {
opt(&o)
}

cli := httpx.Client(
httpx.ClientOptions().
WithUserAgent("gguf-parser-go").
If(o.Debug, func(x *httpx.ClientOption) *httpx.ClientOption {
return x.WithDebug()
}).
WithTimeout(0).
WithTransport(
httpx.TransportOptions().
WithoutKeepalive().
TimeoutForDial(5*time.Second).
TimeoutForTLSHandshake(5*time.Second).
TimeoutForResponseHeader(5*time.Second).
If(o.SkipProxy, func(x *httpx.TransportOption) *httpx.TransportOption {
return x.WithoutProxy()
}).
If(o.ProxyURL != nil, func(x *httpx.TransportOption) *httpx.TransportOption {
return x.WithProxy(http.ProxyURL(o.ProxyURL))
}).
If(o.SkipTLSVerification, func(x *httpx.TransportOption) *httpx.TransportOption {
return x.WithoutInsecureVerify()
})))

var (
f io.ReadSeeker
s int64
)
{
req, err := httpx.NewGetRequestWithContext(ctx, url)
if err != nil {
return nil, fmt.Errorf("new request: %w", err)
}

var sf *httpx.SeekerFile
if o.BufferSize > 0 {
sf, err = httpx.OpenSeekerFileWithSize(cli, req, o.BufferSize, 0)
} else {
sf, err = httpx.OpenSeekerFile(cli, req)
}
if err != nil {
return nil, fmt.Errorf("open http file: %w", err)
}
defer osx.Close(sf)
f = io.NewSectionReader(sf, 0, sf.Len())
s = sf.Len()
}

return parseGGUFFile(s, f, o)
}

// ParseGGUFFileFromHuggingFace parses a GGUF file from Hugging Face,
// and returns a GGUFFile, or an error if any.
func ParseGGUFFileFromHuggingFace(ctx context.Context, repo, file string, opts ...GGUFReadOption) (*GGUFFile, error) {
return ParseGGUFFileRemote(ctx, fmt.Sprintf("https://huggingface.co/%s/resolve/main/%s", repo, file), opts...)
}

func parseGGUFFile(s int64, f io.ReadSeeker, o _GGUFReadOptions) (_ *GGUFFile, err error) {
var gf GGUFFile
var bo binary.ByteOrder = binary.LittleEndian
Expand Down
Loading

0 comments on commit c293abd

Please sign in to comment.