feat: support model scope repo
Signed-off-by: thxCode <[email protected]>
thxCode committed Jul 11, 2024
1 parent 1b3c92e commit 7b075b0
Showing 7 changed files with 204 additions and 38 deletions.
36 changes: 35 additions & 1 deletion cmd/gguf-parser/README.md
@@ -20,7 +20,7 @@ Usage of gguf-parser ...:
-gpu-layers-step uint
Specify the step of layers to offload, works with --gpu-layers.
-hf-file string
Model file below the --repo, e.g. Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf.
Model file below the --hf-repo, e.g. Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf.
-hf-repo string
Repository of HuggingFace which the GGUF file store, e.g. NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF, works with --hf-file.
-in-max-ctx-size
@@ -33,6 +33,10 @@ Usage of gguf-parser ...:
Output as pretty JSON. (default true)
-kv-type string
Specify the type of Key-Value cache, which is used to estimate the usage, select from [f32, f16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1], default is f16. Use quantization type means enabling --flash-attention as well. (default "f16")
-ms-file string
Model file below the --ms-repo, e.g. qwen1.5-0.5b-chat.gguf.
-ms-repo string
Repository of ModelScope which the GGUF file store, e.g. qwen/Qwen1.5-0.5B-Chat-GGUF, works with --ms-file.
-no-kv-offload
Specify disabling Key-Value offloading, which is used to estimate the usage. Key-Value offloading can reduce the usage of VRAM.
-no-mmap
@@ -172,6 +176,36 @@ $ gguf-parser --hf-repo="openbmb/MiniCPM-Llama3-V-2_5-gguf" --hf-file="ggml-mode
```
#### Parse ModelScope GGUF file
```shell
$ gguf-parser --ms-repo="shaowenchen/chinese-alpaca-2-13b-16k-gguf" --ms-file="chinese-alpaca-2-13b-16k.Q5_K.gguf"
+--------------+------+-------+----------------+---------------+----------+------------+----------+
| \ | Name | Arch | Quantization | Little Endian | Size | Parameters | BPW |
+--------------+------+-------+----------------+---------------+----------+------------+----------+
| MODEL | .. | llama | IQ3_XXS/Q5_K_M | true | 8.76 GiB | 13.25 B | 5.68 bpw |
+--------------+------+-------+----------------+---------------+----------+------------+----------+
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
| \ | Max Context Len | Embedding Len | Embedding GQA | Attention Head Cnt | Layers | Feed Forward Len | Expert Cnt | Vocabulary Len |
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
| ARCHITECTURE | 16384 | 5120 | 1 | N/A | 40 | 13824 | 0 | 55296 |
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
| \ | Model | Tokens Size | Tokens Len | Added Tokens Len | BOS Token | EOS Token | Unknown Token | Separator Token | Padding Token |
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
| TOKENIZER | llama | 769.83 KiB | 55296 | N/A | 1 | 2 | N/A | N/A | N/A |
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
+--------------+-------+--------------+-----------------+--------------+----------------+----------------+-----------------------------------+------------+-------------+
| \ | Arch | Context Size | Flash Attention | MMap Support | Offload Layers | Full Offloaded | UMA (RAM + VRAM) | NonUMA RAM | NonUMA VRAM |
+--------------+-------+--------------+-----------------+--------------+----------------+----------------+-----------------------------------+------------+-------------+
| ESTIMATE | llama | 16384 | false | true | 41 (40 + 1) | Yes | 61.18 MiB + 20.87 GiB = 20.92 GiB | 211.18 MiB | 22.74 GiB |
+--------------+-------+--------------+-----------------+--------------+----------------+----------------+-----------------------------------+------------+-------------+
```
#### Parse Ollama model
```shell
10 changes: 9 additions & 1 deletion cmd/gguf-parser/main.go
@@ -31,6 +31,8 @@ func main() {
url string
hfRepo string
hfFile string
msRepo string
msFile string
olModel string
olCrawl bool
olUsage bool
@@ -82,8 +84,12 @@ func main() {
"Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf.")
fs.StringVar(&hfRepo, "hf-repo", hfRepo, "Repository of HuggingFace which the GGUF file store, e.g. "+
"NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF, works with --hf-file.")
fs.StringVar(&hfFile, "hf-file", hfFile, "Model file below the --repo, e.g. "+
fs.StringVar(&hfFile, "hf-file", hfFile, "Model file below the --hf-repo, e.g. "+
"Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf.")
fs.StringVar(&msRepo, "ms-repo", msRepo, "Repository of ModelScope which the GGUF file store, e.g. "+
"qwen/Qwen1.5-0.5B-Chat-GGUF, works with --ms-file.")
fs.StringVar(&msFile, "ms-file", msFile, "Model file below the --ms-repo, e.g. "+
"qwen1.5-0.5b-chat.gguf.")
fs.StringVar(&olModel, "ol-model", olModel, "Model name of Ollama, e.g. "+
"gemma2.")
fs.BoolVar(&olCrawl, "ol-crawl", olCrawl, "Crawl the Ollama model instead of blobs fetching, "+
@@ -242,6 +248,8 @@ func main() {
gf, err = ParseGGUFFileRemote(ctx, url, ropts...)
case hfRepo != "" && hfFile != "":
gf, err = ParseGGUFFileFromHuggingFace(ctx, hfRepo, hfFile, ropts...)
case msRepo != "" && msFile != "":
gf, err = ParseGGUFFileFromModelScope(ctx, msRepo, msFile, ropts...)
case olModel != "":
om := ParseOllamaModel(olModel)
gf, err = ParseGGUFFileFromOllamaModel(ctx, om, olCrawl, ropts...)
22 changes: 15 additions & 7 deletions file_from_remote.go
@@ -11,12 +11,19 @@ import (
"github.com/thxcode/gguf-parser-go/util/osx"
)

// ParseGGUFFileFromHuggingFace parses a GGUF file from Hugging Face,
// ParseGGUFFileFromHuggingFace parses a GGUF file from Hugging Face(https://huggingface.co/),
// and returns a GGUFFile, or an error if any.
func ParseGGUFFileFromHuggingFace(ctx context.Context, repo, file string, opts ...GGUFReadOption) (*GGUFFile, error) {
return ParseGGUFFileRemote(ctx, fmt.Sprintf("https://huggingface.co/%s/resolve/main/%s", repo, file), opts...)
}

// ParseGGUFFileFromModelScope parses a GGUF file from Model Scope(https://modelscope.cn/),
// and returns a GGUFFile, or an error if any.
func ParseGGUFFileFromModelScope(ctx context.Context, repo, file string, opts ...GGUFReadOption) (*GGUFFile, error) {
opts = append(opts[:len(opts):len(opts)], SkipRangeDownloadDetection())
return ParseGGUFFileRemote(ctx, fmt.Sprintf("https://modelscope.cn/models/%s/resolve/master/%s", repo, file), opts...)
}

// ParseGGUFFileRemote parses a GGUF file from a remote BlobURL,
// and returns a GGUFFile, or an error if any.
func ParseGGUFFileRemote(ctx context.Context, url string, opts ...GGUFReadOption) (*GGUFFile, error) {
@@ -65,16 +72,17 @@ func parseGGUFFileFromRemote(ctx context.Context, cli *http.Client, url string,
return nil, fmt.Errorf("new request: %w", err)
}

var sf *httpx.SeekerFile
if o.BufferSize > 0 {
sf, err = httpx.OpenSeekerFileWithSize(cli, req, o.BufferSize, 0)
} else {
sf, err = httpx.OpenSeekerFile(cli, req)
}
sf, err := httpx.OpenSeekerFile(cli, req,
httpx.SeekerFileOptions().
WithBufferSize(o.BufferSize).
If(o.SkipRangeDownloadDetection, func(x *httpx.SeekerFileOption) *httpx.SeekerFileOption {
return x.WithoutRangeDownloadDetect()
}))
if err != nil {
return nil, fmt.Errorf("open http file: %w", err)
}
defer osx.Close(sf)

f = io.NewSectionReader(sf, 0, sf.Len())
s = sf.Len()
}
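For reference, a minimal usage sketch of the new ModelScope entry point, mirroring the first case added to file_test.go further down. Only the signatures shown in this diff (ParseGGUFFileFromModelScope, SkipLargeMetadata) are taken as given; the package import alias is an assumption.

```go
package main

import (
	"context"
	"fmt"

	gguf_parser "github.com/thxcode/gguf-parser-go" // import alias is an assumption
)

func main() {
	ctx := context.Background()

	// Resolves to https://modelscope.cn/models/<repo>/resolve/master/<file>
	// and skips the range-download detection, as wired up above.
	gf, err := gguf_parser.ParseGGUFFileFromModelScope(
		ctx,
		"qwen/Qwen1.5-0.5B-Chat-GGUF",
		"qwen1_5-0_5b-chat-q5_k_m.gguf",
		gguf_parser.SkipLargeMetadata(), // same option the new test case uses
	)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", gf)
}
```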
18 changes: 13 additions & 5 deletions file_option.go
@@ -11,11 +11,12 @@ type (
MMap bool

// Remote.
ProxyURL *url.URL
SkipProxy bool
SkipTLSVerification bool
SkipDNSCache bool
BufferSize int
ProxyURL *url.URL
SkipProxy bool
SkipTLSVerification bool
SkipDNSCache bool
BufferSize int
SkipRangeDownloadDetection bool
}
GGUFReadOption func(o *_GGUFReadOptions)
)
@@ -80,3 +81,10 @@ func UseBufferSize(size int) GGUFReadOption {
o.BufferSize = size
}
}

// SkipRangeDownloadDetection skips the range download detection when reading from remote.
func SkipRangeDownloadDetection() GGUFReadOption {
return func(o *_GGUFReadOptions) {
o.SkipRangeDownloadDetection = true
}
}
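A short sketch of how the new read option composes with the existing ones when the remote host mishandles HEAD requests or Accept-Ranges. The URL is a placeholder and the import alias is an assumption; the option constructors are the ones defined in this file.

```go
package main

import (
	"context"
	"log"

	gguf_parser "github.com/thxcode/gguf-parser-go" // import alias is an assumption
)

func main() {
	ctx := context.Background()

	// Hypothetical GGUF URL on a server that does not answer HEAD requests
	// or reports Accept-Ranges incorrectly.
	url := "https://example.com/models/some-model.Q4_K_M.gguf"

	gf, err := gguf_parser.ParseGGUFFileRemote(
		ctx,
		url,
		gguf_parser.UseBufferSize(8*1024*1024),   // raise the read buffer from the 4 MiB default
		gguf_parser.SkipRangeDownloadDetection(), // size the file via a plain GET instead of HEAD
	)
	if err != nil {
		log.Fatal(err)
	}
	_ = gf // inspect metadata, estimate usage, etc.
}
```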
29 changes: 29 additions & 0 deletions file_test.go
@@ -207,6 +207,35 @@ func TestParseGGUFFileFromHuggingFace(t *testing.T) {
}
}

func TestParseGGUFFileFromModelScope(t *testing.T) {
ctx := context.Background()

cases := [][2]string{
{
"qwen/Qwen1.5-0.5B-Chat-GGUF",
"qwen1_5-0_5b-chat-q5_k_m.gguf",
},
{
"HIT-SCIR/huozi3-gguf",
"huozi3-q2_k.gguf",
},
{
"shaowenchen/chinese-alpaca-2-13b-16k-gguf",
"chinese-alpaca-2-13b-16k.Q5_K.gguf",
},
}
for _, tc := range cases {
t.Run(tc[0]+"/"+tc[1], func(t *testing.T) {
f, err := ParseGGUFFileFromModelScope(ctx, tc[0], tc[1], SkipLargeMetadata())
if err != nil {
t.Fatal(err)
return
}
t.Log("\n", spew.Sdump(f), "\n")
})
}
}

func TestParseGGUFFileFromOllama(t *testing.T) {
ctx := context.Background()

66 changes: 42 additions & 24 deletions util/httpx/file.go
@@ -21,11 +21,9 @@ type SeekerFile struct {
l int64
}

func OpenSeekerFile(cli *http.Client, req *http.Request) (*SeekerFile, error) {
return OpenSeekerFileWithSize(cli, req, 0, 0)
}

func OpenSeekerFileWithSize(cli *http.Client, req *http.Request, bufSize, size int) (*SeekerFile, error) {
// OpenSeekerFile tries the GET http.Request as a SeekerFile,
// and returns a SeekerFile, or an error if any.
func OpenSeekerFile(cli *http.Client, req *http.Request, opts ...*SeekerFileOption) (*SeekerFile, error) {
if cli == nil {
return nil, errors.New("client is nil")
}
@@ -36,37 +34,57 @@ func OpenSeekerFileWithSize(cli *http.Client, req *http.Request, bufSize, size i
return nil, errors.New("request method is not GET")
}

var o *SeekerFileOption
if len(opts) > 0 {
o = opts[0]
} else {
o = SeekerFileOptions()
}
if o.bufSize <= 0 {
o.bufSize = 4 * 1024 * 1024 // 4mb
}

var l int64
{
req := req.Clone(req.Context())
req.Method = http.MethodHead
err := Do(cli, req, func(resp *http.Response) error {
if resp.StatusCode != http.StatusOK {
return fmt.Errorf("stat: status code %d", resp.StatusCode)
if !o.skipRangeDownloadDetect {
req := req.Clone(req.Context())
req.Method = http.MethodHead
err := Do(cli, req, func(resp *http.Response) error {
if resp.StatusCode != http.StatusOK {
return fmt.Errorf("stat: status code %d", resp.StatusCode)
}
if !strings.EqualFold(resp.Header.Get("Accept-Ranges"), "bytes") {
return fmt.Errorf("stat: not support range download")
}
l = resp.ContentLength
return nil
})
if err != nil {
return nil, fmt.Errorf("stat: do head request: %w", err)
}
if !strings.EqualFold(resp.Header.Get("Accept-Ranges"), "bytes") {
return fmt.Errorf("stat: not support range download")
} else {
req := req.Clone(req.Context())
err := Do(cli, req, func(resp *http.Response) error {
if resp.StatusCode != http.StatusOK {
return fmt.Errorf("stat: status code %d", resp.StatusCode)
}
l = resp.ContentLength
return nil
})
if err != nil {
return nil, fmt.Errorf("stat: do get request: %w", err)
}
l = resp.ContentLength
return nil
})
if err != nil {
return nil, fmt.Errorf("stat: do head request: %w", err)
}
switch sz := int64(size); {
switch sz := int64(o.size); {
case sz > l:
return nil, fmt.Errorf("size %d is greater than limit %d", size, l)
return nil, fmt.Errorf("size %d is greater than limit %d", o.size, l)
case sz <= 0:
default:
l = sz
}
}

if bufSize <= 0 {
bufSize = 4 * 1024 * 1024 // 4mb
}

b := ringbuffer.New(bufSize).WithCancel(req.Context())
b := ringbuffer.New(o.bufSize).WithCancel(req.Context())
return &SeekerFile{cli: cli, req: req, b: b, c: 1<<63 - 1, l: l}, nil
}

61 changes: 61 additions & 0 deletions util/httpx/file_options.go
@@ -0,0 +1,61 @@
package httpx

type SeekerFileOption struct {
bufSize int
size int
skipRangeDownloadDetect bool
}

func SeekerFileOptions() *SeekerFileOption {
return &SeekerFileOption{
bufSize: 4 * 1024 * 1024, // 4mb
}
}

// WithBufferSize sets the size of the buffer to read the file,
//
// Default is 4mb.
func (o *SeekerFileOption) WithBufferSize(bufSize int) *SeekerFileOption {
if o == nil || bufSize <= 0 {
return o
}
o.bufSize = bufSize
return o
}

// WithSize sets the size of the file to read,
//
// If the size is greater than the content size of the file, it will return an error.
func (o *SeekerFileOption) WithSize(size int) *SeekerFileOption {
if o == nil || size <= 0 {
return o
}
o.size = size
return o
}

// WithoutRangeDownloadDetect disables range download detection.
//
// Usually, OpenSeekerFile sends a "HEAD" HTTP request to destination to get the content size from the "Content-Length" header,
// and confirms whether supports range download via the "Accept-Ranges" header.
// However, some servers may not support the "HEAD" method, or the "Accept-Ranges" header is not set correctly.
//
// With this option, OpenSeekerFile sends "GET" HTTP request to get the content size as usual,
// and does not confirm whether supports range download. But during the seeking read,
// it still uses the "Range" header to read the file.
func (o *SeekerFileOption) WithoutRangeDownloadDetect() *SeekerFileOption {
if o == nil {
return o
}
o.skipRangeDownloadDetect = true
return o
}

// If is a conditional option,
// which receives a boolean condition to trigger the given function or not.
func (o *SeekerFileOption) If(condition bool, then func(*SeekerFileOption) *SeekerFileOption) *SeekerFileOption {
if condition {
return then(o)
}
return o
}
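For completeness, a sketch of driving the reworked OpenSeekerFile directly with the option builder added in this file. The URL is a placeholder; only the httpx and osx identifiers shown in this diff are relied on, and wrapping the SeekerFile in an io.SectionReader mirrors what parseGGUFFileFromRemote does above.

```go
package main

import (
	"fmt"
	"io"
	"net/http"

	"github.com/thxcode/gguf-parser-go/util/httpx"
	"github.com/thxcode/gguf-parser-go/util/osx"
)

func main() {
	cli := &http.Client{} // a plain client; the parser normally builds its own
	req, err := http.NewRequest(http.MethodGet, "https://example.com/large-file.gguf", nil)
	if err != nil {
		panic(err)
	}

	sf, err := httpx.OpenSeekerFile(cli, req,
		httpx.SeekerFileOptions().
			WithBufferSize(8*1024*1024).  // override the 4 MiB default
			WithoutRangeDownloadDetect()) // size the file via GET instead of HEAD
	if err != nil {
		panic(err)
	}
	defer osx.Close(sf)

	// Read the first 16 bytes through a section reader, the same way
	// parseGGUFFileFromRemote wraps the SeekerFile.
	r := io.NewSectionReader(sf, 0, sf.Len())
	buf := make([]byte, 16)
	if _, err := io.ReadFull(r, buf); err != nil {
		panic(err)
	}
	fmt.Printf("% x\n", buf)
}
```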
