py : type-check all Python scripts with Pyright #8341

Merged (10 commits) on Jul 7, 2024
16 changes: 16 additions & 0 deletions .devops/nix/package.nix
@@ -89,6 +89,22 @@ let
ps.tiktoken
ps.torchWithoutCuda
ps.transformers

+# server bench
+ps.matplotlib
+
+# server tests
+ps.openai
+ps.behave
+ps.prometheus-client
+
+# for examples/pydantic-models-to-grammar-examples.py
+ps.docstring-parser
+ps.pydantic
+
+# for scripts/compare-llama-bench.py
+ps.gitpython
+ps.tabulate
]
);

40 changes: 40 additions & 0 deletions .github/workflows/python-type-check.yml
@@ -0,0 +1,40 @@
name: Python Type-Check

on:
  push:
    paths:
      - '.github/workflows/python-type-check.yml'
      - '**.py'
      - 'requirements.txt'
      - 'requirements/*.txt'
  pull_request:
    paths:
      - '.github/workflows/python-type-check.yml'
      - '**.py'
      - 'requirements.txt'
      - 'requirements/*.txt'

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  python-type-check:
    runs-on: ubuntu-latest
    name: pyright type-check
    steps:
      - name: Check out source repository
        uses: actions/checkout@v4
      - name: Set up Python environment
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
          cache: pip
      - name: Install Python dependencies
        # TODO: use a venv
        run: pip install -r requirements/requirements-all.txt
      - name: Type-check with Pyright
        uses: jakebailey/pyright-action@v2
        with:
          version: 1.1.370
          warnings: true
16 changes: 6 additions & 10 deletions convert_hf_to_gguf.py
@@ -265,7 +265,7 @@ def write_tensors(self):
break

for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
-data: np.ndarray = data # type hint
+data: np.ndarray # type hint
n_dims = len(data.shape)
data_dtype = data.dtype
data_qtype: gguf.GGMLQuantizationType | None = None
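With the self-assignment gone, the bare annotation statement still declares the loop variable's type for the checker; unlike `data: np.ndarray = data`, it generates no runtime assignment and gives Pyright nothing to flag. A minimal sketch of the pattern, with illustrative names:

```python
import numpy as np

def describe(arrays: list) -> None:
    for data in arrays:
        data: np.ndarray  # bare annotation: declares the type, assigns nothing
        print(data.shape, data.dtype)

describe([np.zeros((2, 3))])
```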
@@ -596,10 +596,6 @@ def _create_vocab_sentencepiece(self):

tokenizer_path = self.dir_model / 'tokenizer.model'

-tokens: list[bytes] = []
-scores: list[float] = []
-toktypes: list[int] = []

if not tokenizer_path.is_file():
raise FileNotFoundError(f"File not found: {tokenizer_path}")

@@ -2117,7 +2113,7 @@ def set_vocab(self):
logger.error(f'Error: Missing {tokenizer_path}')
sys.exit(1)

-sentencepiece_model = model.ModelProto()
+sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
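The trailing comment is a scoped suppression: protobuf-generated modules such as sentencepiece's `sentencepiece_model_pb2` expose `ModelProto` only at runtime, so Pyright cannot see the attribute. Unlike a blanket `# type: ignore`, the `pyright: ignore[...]` form silences just the named rule on that one line. A sketch, assuming the sentencepiece package and a local `tokenizer.model`:

```python
from sentencepiece import sentencepiece_model_pb2 as model  # generated protobuf module

proto = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
proto.ParseFromString(open("tokenizer.model", "rb").read())
```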

@@ -2969,16 +2965,16 @@ def set_vocab(self):
if not tokenizer_path.is_file():
raise FileNotFoundError(f"File not found: {tokenizer_path}")

-sentencepiece_model = model.ModelProto()
+sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())

# some models like Pile-T5 family use BPE tokenizer instead of Unigram
-if sentencepiece_model.trainer_spec.model_type == 2: # BPE
+if sentencepiece_model.trainer_spec.model_type == 2:  # BPE
# assure the tokenizer model file name is correct
assert tokenizer_path.name == 'tokenizer.model'
return self._set_vocab_sentencepiece()
else:
-assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
+assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM

add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
@@ -3149,7 +3145,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
# but Jais's PyTorch model simply precalculates the slope values and places them
# in relative_pes.slopes
n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"]))
-first_val = float(data_torch._data[0])
+first_val = float(data_torch[0].item())
self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2)

return tensors
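`Tensor._data` is a private attribute that PyTorch's public stubs do not declare, so Pyright rejects it; indexing followed by `.item()` reads the same first element through the typed public API:

```python
import torch

slopes = torch.tensor([0.0625, 0.125, 0.25])
first_val = float(slopes[0].item())  # .item() returns a plain Python scalar
print(first_val)  # 0.0625
```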
3 changes: 2 additions & 1 deletion convert_llama_ggml_to_gguf.py
@@ -354,7 +354,8 @@ def add_tensors(self, gguf_writer):


def handle_metadata(cfg, hp):
-import convert
+import examples.convert_legacy_llama as convert

assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
hf_config_path = cfg.model_metadata_dir / "config.json"
orig_config_path = cfg.model_metadata_dir / "params.json"
21 changes: 12 additions & 9 deletions examples/convert_legacy_llama.py
@@ -353,7 +353,7 @@ class Metadata:
version: Optional[str] = None
url: Optional[str] = None
description: Optional[str] = None
-licence: Optional[str] = None
+license: Optional[str] = None
source_url: Optional[str] = None
source_hf_repo: Optional[str] = None

@@ -492,12 +492,13 @@ def validate_conversion_to(self, data_type: DataType) -> None:

LazyModel: TypeAlias = 'dict[str, LazyTensor]'

+ModelFormat: TypeAlias = Literal['ggml', 'torch', 'safetensors', 'none']

@dataclass
class ModelPlus:
model: LazyModel
paths: list[Path] # Where this was read from.
-format: Literal['ggml', 'torch', 'safetensors', 'none']
+format: ModelFormat
vocab: BaseVocab | None # For GGML models (which have vocab built in), the vocab.
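Hoisting the `Literal` union into a named `TypeAlias` lets the dataclass field and later annotations, such as the `formats` set in `merge_multifile_models` below, refer to one definition instead of repeating the literal list. Roughly:

```python
from typing import Literal, TypeAlias

ModelFormat: TypeAlias = Literal['ggml', 'torch', 'safetensors', 'none']

def single_format(formats: set[ModelFormat]) -> ModelFormat:
    assert len(formats) == 1, "different formats?"
    return formats.pop()  # the result keeps the narrow Literal type
```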


@@ -536,7 +537,7 @@ def load() -> UnquantizedTensor:


def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
-formats = set(mp.format for mp in models_plus)
+formats: set[ModelFormat] = set(mp.format for mp in models_plus)
assert len(formats) == 1, "different formats?"
format = formats.pop()
paths = [path for mp in models_plus for path in mp.paths]
@@ -555,7 +556,7 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
else:
model = merge_sharded([mp.model for mp in models_plus])

-return ModelPlus(model, paths, format, vocab) # pytype: disable=wrong-arg-types
+return ModelPlus(model, paths, format, vocab)


def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
@@ -805,7 +806,7 @@ class OutputFile:
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)

-def add_meta_model(self, params: Params, metadata: Metadata) -> None:
+def add_meta_model(self, params: Params, metadata: Metadata | None) -> None:
# Metadata About The Model And Its Provenence
name = "LLaMA"
if metadata is not None and metadata.name is not None:
@@ -827,8 +828,8 @@ def add_meta_model(self, params: Params, metadata: Metadata) -> None:
self.gguf.add_url(metadata.url)
if metadata.description is not None:
self.gguf.add_description(metadata.description)
-if metadata.licence is not None:
-self.gguf.add_licence(metadata.licence)
+if metadata.license is not None:
+self.gguf.add_licence(metadata.license)
if metadata.source_url is not None:
self.gguf.add_source_url(metadata.source_url)
if metadata.source_hf_repo is not None:
@@ -943,7 +944,7 @@ def close(self) -> None:
@staticmethod
def write_vocab_only(
fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
-endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: Metadata = None,
+endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: Metadata | None = None,
) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab)

@@ -977,7 +978,7 @@ def write_all(
fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
pad_vocab: bool = False,
-metadata: Metadata = None,
+metadata: Metadata | None = None,
) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab)

@@ -1396,6 +1397,8 @@ def main(args_in: list[str] | None = None) -> None:
if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab:
vocab = model_plus.vocab

+assert params is not None

logger.info(f"Vocab info: {vocab}")
logger.info(f"Special vocab info: {special_vocab}")
model = model_plus.model
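The added `assert params is not None` narrows `params` from `Params | None` to `Params` for every use below it, and fails fast at runtime if the invariant is ever violated. The narrowing pattern in isolation, with illustrative names:

```python
from __future__ import annotations
from dataclasses import dataclass

@dataclass
class Settings:  # stand-in for the real Params class
    n_ctx: int

def run(settings: Settings | None) -> None:
    assert settings is not None  # checker narrows Settings | None to Settings
    print(settings.n_ctx)

run(Settings(n_ctx=4096))
```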
2 changes: 1 addition & 1 deletion examples/finetune/convert_finetune_checkpoint_to_gguf.py
@@ -74,7 +74,7 @@ def __init__(self, dtype='f', ne=None):
if len(self.ne) == 0:
self.nbytes = 0
else:
-self.nbytes = int(np.product(self.ne)) * 4
+self.nbytes = int(np.prod(self.ne)) * 4
else:
raise ValueError(f"Unhandled data type '{self.dtype}'")
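`np.product` was a deprecated alias that NumPy 2.0 removed; `np.prod` computes the same element product, so the byte count is unchanged:

```python
import numpy as np

ne = (32, 64)                   # tensor shape
nbytes = int(np.prod(ne)) * 4   # element count times 4 bytes (float32)
print(nbytes)                   # 8192
```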

5 changes: 4 additions & 1 deletion examples/json_schema_pydantic_example.py
@@ -3,7 +3,7 @@
#! pip install pydantic
#! python json_schema_pydantic_example.py

-from pydantic import BaseModel, Extra, TypeAdapter
+from pydantic import BaseModel, Field, TypeAdapter
from annotated_types import MinLen
from typing import Annotated, List, Optional
import json, requests
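The import swaps the Pydantic v1-era `Extra` for `Field`; in v2, per-field constraints and descriptions are typically expressed with `Field`. Minimal v2-style usage (an illustrative model, not from this file):

```python
from pydantic import BaseModel, Field

class QAPair(BaseModel):  # illustrative model
    question: str
    concise_answer: str = Field(..., description="answer in one sentence")
```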
@@ -17,6 +17,9 @@ def create_completion(*, response_model=None, endpoint="http://localhost:8080/v1

The response_model param takes a type (+ supports Pydantic) and behaves just as w/ Instructor (see below)
'''
+response_format = None
+type_adapter = None

if response_model:
type_adapter = TypeAdapter(response_model)
schema = type_adapter.json_schema()
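Initializing `response_format` and `type_adapter` to `None` up front binds both names on every control path, which clears Pyright's possibly-unbound diagnostic where they are used after the conditional. The general shape:

```python
from __future__ import annotations

def build_format(response_model: object | None) -> str | None:
    response_format = None  # bound on every path: no "possibly unbound" later
    if response_model is not None:
        response_format = "json_schema"
    return response_format
```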
12 changes: 7 additions & 5 deletions examples/json_schema_to_grammar.py
@@ -1,4 +1,6 @@
#!/usr/bin/env python3
+from __future__ import annotations

import argparse
import itertools
import json
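With `from __future__ import annotations`, every annotation in the file is evaluated lazily, so the PEP 585 builtin generics (`tuple[str, bool]`, `list | None`) used further down parse on all Python versions the scripts support, and the `typing.Tuple` imports become unnecessary. For example:

```python
from __future__ import annotations

def to_rule(s: tuple[str, bool]) -> str:  # builtin generic; no typing.Tuple needed
    txt, is_literal = s
    return f'"{txt}"' if is_literal else txt

print(to_rule(("root", True)))
```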
@@ -188,7 +190,7 @@ def uniform_range(from_str: str, to_str: str):
raise RuntimeError("At least one of min_value or max_value must be set")

class BuiltinRule:
-def __init__(self, content: str, deps: list = None):
+def __init__(self, content: str, deps: list | None = None):
self.content = content
self.deps = deps or []
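`deps: list = None` relies on implicit `Optional`, which Pyright rejects by default: `None` is not a `list`. Writing the union out keeps the default honest, and the `or []` normalization stays the same:

```python
from __future__ import annotations

# Rejected by Pyright (implicit Optional):
#   def make_rule(content: str, deps: list = None): ...
def make_rule(content: str, deps: list | None = None) -> list:
    return deps or []  # None becomes an empty dependency list

print(make_rule("rule"))  # []
```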

@@ -248,7 +250,7 @@ def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern):

def _format_literal(self, literal):
escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub(
-lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), literal
+lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)) or m.group(0), literal
)
return f'"{escaped}"'
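`dict.get` is typed `str | None`, but a replacement callback handed to `re.sub` must return `str`; `or m.group(0)` falls back to the matched text itself, which satisfies the checker and is a no-op for characters already covered by the escape table. A self-contained version:

```python
import re

ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"'}

def format_literal(literal: str) -> str:
    # .get() alone is str | None; the "or" fallback guarantees a str return
    escaped = re.sub(r'[\r\n"]', lambda m: ESCAPES.get(m.group(0)) or m.group(0), literal)
    return f'"{escaped}"'

print(format_literal('a "quoted" line\n'))
```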

@@ -403,11 +405,11 @@ def _visit_pattern(self, pattern, name):
i = 0
length = len(pattern)

-def to_rule(s: Tuple[str, bool]) -> str:
+def to_rule(s: tuple[str, bool]) -> str:
(txt, is_literal) = s
return "\"" + txt + "\"" if is_literal else txt

-def transform() -> Tuple[str, bool]:
+def transform() -> tuple[str, bool]:
'''
Parse a unit at index i (advancing it), and return its string representation + whether it's a literal.
'''
@@ -420,7 +422,7 @@ def transform() -> Tuple[str, bool]:
# We only need a flat structure here to apply repetition operators to the last item, and
# to merge literals at the and (we're parsing grouped ( sequences ) recursively and don't treat '|' specially
# (GBNF's syntax is luckily very close to regular expressions!)
-seq: list[Tuple[str, bool]] = []
+seq: list[tuple[str, bool]] = []

def get_dot():
if self._dotall:
10 changes: 6 additions & 4 deletions examples/llava/convert_image_encoder_to_gguf.py
@@ -185,6 +185,8 @@ def bytes_to_unicode():
fout.add_description("two-tower CLIP model")

if has_text_encoder:
+assert t_hparams is not None
+assert tokens is not None
# text_model hparams
fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"])
fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"])
@@ -259,8 +261,8 @@ def bytes_to_unicode():


if processor is not None:
-image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
-image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
+image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean # pyright: ignore[reportAttributeAccessIssue]
+image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std # pyright: ignore[reportAttributeAccessIssue]
else:
image_mean = args.image_mean if args.image_mean is not None else default_image_mean
image_std = args.image_std if args.image_std is not None else default_image_std
@@ -272,7 +274,7 @@ def bytes_to_unicode():


if has_llava_projector:
-model.vision_model.encoder.layers.pop(-1)
+model.vision_model.encoder.layers.pop(-1) # pyright: ignore[reportAttributeAccessIssue]
projector = torch.load(args.llava_projector)
for name, data in projector.items():
name = get_tensor_name(name)
@@ -286,7 +288,7 @@ def bytes_to_unicode():

print("Projector tensors added\n")

-state_dict = model.state_dict()
+state_dict = model.state_dict() # pyright: ignore[reportAttributeAccessIssue]
for name, data in state_dict.items():
if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector):
# we don't need this
10 changes: 7 additions & 3 deletions examples/llava/llava_surgery_v2.py
@@ -2,7 +2,9 @@
import glob
import os
import torch
-from safetensors.torch import load as safe_load, save as safe_save, safe_open, save_file
+from safetensors import safe_open
+from safetensors.torch import save_file
+from typing import Any, ContextManager, cast

# Function to determine if file is a SafeTensor file
def is_safetensor_file(file_path):
@@ -13,7 +15,7 @@ def is_safetensor_file(file_path):
def load_model(file_path):
if is_safetensor_file(file_path):
tensors = {}
-with safe_open(file_path, framework="pt", device="cpu") as f:
+with cast(ContextManager[Any], safe_open(file_path, framework="pt", device="cpu")) as f:
for key in f.keys():
tensors[key] = f.get_tensor(key).clone()
# output shape
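Pyright cannot prove that the object returned by `safe_open` implements the context-manager protocol, so the code asserts it with `cast`, which is a no-op at runtime. The pattern in isolation, assuming a local safetensors file:

```python
from typing import Any, ContextManager, cast
from safetensors import safe_open

def load_tensors(path: str) -> dict:
    tensors = {}
    # cast() only informs the checker; it does not change the object
    with cast(ContextManager[Any], safe_open(path, framework="pt", device="cpu")) as f:
        for key in f.keys():
            tensors[key] = f.get_tensor(key).clone()
    return tensors
```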
@@ -134,7 +136,7 @@ def proj_criteria(checkpoint):
if last_checkpoint is not None:
for k, v in last_checkpoint.items():
print(k)
print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint)} tensors.")
print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint) if last_checkpoint is not None else 0} tensors.")
print("No tensors found. Is this a LLaVA model?")
exit()

@@ -143,8 +145,10 @@ def proj_criteria(checkpoint):
# projector = {name: checkpoint.[name].float() for name in mm_tensors}
projector = {}
for name in mm_tensors:
+assert last_checkpoint is not None
projector[name] = last_checkpoint[name].float()
for name in first_mm_tensors:
+assert first_checkpoint is not None
projector[name] = first_checkpoint[name].float()

if len(projector) > 0:
1 change: 1 addition & 0 deletions examples/llava/requirements.txt
@@ -1,3 +1,4 @@
-r ../../requirements/requirements-convert_legacy_llama.txt
--extra-index-url https://download.pytorch.org/whl/cpu
pillow~=10.2.0
torch~=2.2.1