From a59b49fd974fa2f8f443c19450d8a3f6652e71bc Mon Sep 17 00:00:00 2001
From: BuildTools
Date: Sat, 31 Aug 2024 14:01:41 -0700
Subject: [PATCH] chore: update llama.cpp convert scripts

---
 requirements.txt            |   1 +
 src/convert_hf_to_gguf.py   | 161 ++++++++++++------------------
 src/convert_lora_to_gguf.py |  91 +++++---------------
 3 files changed, 75 insertions(+), 178 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 8e81420..ecd8764 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,3 +11,4 @@ python-dotenv~=1.0.1
 safetensors~=0.4.4
 setuptools~=68.2.0
 huggingface-hub~=0.24.6
+transformers~=4.44.2
diff --git a/src/convert_hf_to_gguf.py b/src/convert_hf_to_gguf.py
index 0f120b7..6fb495a 100644
--- a/src/convert_hf_to_gguf.py
+++ b/src/convert_hf_to_gguf.py
@@ -69,6 +69,7 @@ class Model:
     model_name: str | None
     metadata_override: Path | None
     dir_model_card: Path
+    is_lora: bool

     model_arch: gguf.MODEL_ARCH

@@ -86,6 +87,7 @@ def __init__(
         split_max_size: int = 0,
         dry_run: bool = False,
         small_first_shard: bool = False,
+        is_lora: bool = False,
     ):
         if type(self) is Model:
             raise TypeError(
@@ -118,6 +120,7 @@ def __init__(
         self.metadata_override = metadata_override
         self.model_name = model_name
         self.dir_model_card = dir_model
+        self.is_lora = is_lora

         if self.ftype == gguf.LlamaFileType.GUESSED:
@@ -381,6 +384,7 @@ def prepare_tensors(self):
                             gguf.MODEL_TENSOR.FFN_GATE_INP,
                             gguf.MODEL_TENSOR.POS_EMBD,
                             gguf.MODEL_TENSOR.TOKEN_TYPES,
+                            gguf.MODEL_TENSOR.SSM_CONV1D,
                         )
                     )
                     or not name.endswith(".weight")
@@ -1831,7 +1835,10 @@ def prepare_tensors(self):
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", "").lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                dim = self.hparams.get(
+                    "head_dim",
+                    self.hparams["hidden_size"] // self.hparams["num_attention_heads"],
+                )
                 freqs = 1.0 / (
                     base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)
                 )
@@ -1860,10 +1867,11 @@ def prepare_tensors(self):
                         )
                         rope_factors.append(1 / ((1 - smooth) / factor + smooth))

-            self.gguf_writer.add_tensor(
-                self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS),
-                np.array(rope_factors, dtype=np.float32),
-            )
+            if not self.is_lora:
+                self.gguf_writer.add_tensor(
+                    self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS),
+                    np.array(rope_factors, dtype=np.float32),
+                )

         super().prepare_tensors()

@@ -2472,14 +2480,15 @@ def set_gguf_parameters(self):
                 f"The length of rope long and short factors must be {rope_dims / 2}"
             )

-        self.gguf_writer.add_tensor(
-            gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight",
-            np.array(long_factors, dtype=np.float32),
-        )
-        self.gguf_writer.add_tensor(
-            gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight",
-            np.array(short_factors, dtype=np.float32),
-        )
+        if not self.is_lora:
+            self.gguf_writer.add_tensor(
+                gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight",
+                np.array(long_factors, dtype=np.float32),
+            )
+            self.gguf_writer.add_tensor(
+                gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight",
+                np.array(short_factors, dtype=np.float32),
+            )


 @Model.register("PlamoForCausalLM")
@@ -3081,7 +3090,7 @@ class StarCoder2Model(Model):
     model_arch = gguf.MODEL_ARCH.STARCODER2


-@Model.register("MambaForCausalLM", "MambaLMHeadModel")
+@Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
 class MambaModel(Model):
     model_arch = gguf.MODEL_ARCH.MAMBA

@@ -3117,6 +3126,10 @@ def set_gguf_parameters(self):
             self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True)
             or 1e-5
         )
+        use_dt_b_c_norm = False
+
+        if self.find_hparam(["model_type"], optional=True) in ("falcon_mamba",):
+            use_dt_b_c_norm = True

         assert d_inner == 2 * d_model

@@ -3124,12 +3137,13 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_embedding_length(d_model)
         self.gguf_writer.add_feed_forward_length(0)
         self.gguf_writer.add_head_count(0)
-        self.gguf_writer.add_block_count(self.hparams["n_layer"])
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_ssm_conv_kernel(d_conv)
         self.gguf_writer.add_ssm_inner_size(d_inner)
         self.gguf_writer.add_ssm_state_size(d_state)
         self.gguf_writer.add_ssm_time_step_rank(dt_rank)
         self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
+        self.gguf_writer.add_ssm_dt_b_c_rms(use_dt_b_c_norm)
         self.gguf_writer.add_file_type(self.ftype)

     _tok_embd = None
@@ -3159,25 +3173,6 @@ def modify_tensors(

         return [(new_name, data_torch)]

-    def tensor_force_quant(
-        self, name: str, new_name: str, bid: int | None, n_dims: int
-    ) -> gguf.GGMLQuantizationType | bool:
-        if bid is not None and new_name in (
-            self.format_tensor_name(
-                n, bid, ".weight" if name.endswith(".weight") else ""
-            )
-            for n in [
-                gguf.MODEL_TENSOR.SSM_CONV1D,
-                gguf.MODEL_TENSOR.SSM_X,
-                gguf.MODEL_TENSOR.SSM_DT,
-                gguf.MODEL_TENSOR.SSM_A,
-                gguf.MODEL_TENSOR.SSM_D,
-            ]
-        ):
-            return gguf.GGMLQuantizationType.F32
-
-        return super().tensor_force_quant(name, new_name, bid, n_dims)
-

 @Model.register("CohereForCausalLM")
 class CommandR2Model(Model):
@@ -4301,7 +4296,10 @@ def prepare_tensors(self):
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", "").lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                dim = self.hparams.get(
+                    "head_dim",
+                    self.hparams["hidden_size"] // self.hparams["num_attention_heads"],
+                )
                 freqs = 1.0 / (
                     base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)
                 )
@@ -4330,10 +4328,11 @@ def prepare_tensors(self):
                         )
                         rope_factors.append(1 / ((1 - smooth) / factor + smooth))

-            self.gguf_writer.add_tensor(
-                self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS),
-                np.array(rope_factors, dtype=np.float32),
-            )
+            if not self.is_lora:
+                self.gguf_writer.add_tensor(
+                    self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS),
+                    np.array(rope_factors, dtype=np.float32),
+                )

         super().prepare_tensors()

@@ -4403,82 +4402,26 @@ def __torch_function__(cls, func, types, args=(), kwargs=None):

 def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(description="")
-    parser.add_argument(
-        "--vocab-only",
-        action="store_true",
-        help="",
-    )
-    parser.add_argument(
-        "--outfile",
-        type=Path,
-        help="",
-    )
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--vocab-only", action="store_true")
+    parser.add_argument("--outfile", type=Path)
     parser.add_argument(
         "--outtype",
         type=str,
         choices=["f32", "f16", "bf16", "q8_0", "auto"],
         default="f16",
-        help="",
-    )
-    parser.add_argument(
-        "--bigendian",
-        action="store_true",
-        help="",
-    )
-    parser.add_argument(
-        "model",
-        type=Path,
-        help="",
-    )
-    parser.add_argument(
-        "--use-temp-file",
-        action="store_true",
-        help="",
-    )
-    parser.add_argument(
-        "--no-lazy",
-        action="store_true",
-        help="",
-    )
-    parser.add_argument(
-        "--model-name",
-        type=str,
-        default=None,
-        help="",
-    )
-    parser.add_argument(
-        "--verbose",
-        action="store_true",
-        help="",
-    )
-    parser.add_argument(
-        "--split-max-tensors",
-        type=int,
-        default=0,
-        help="",
-    )
-    parser.add_argument(
-        "--split-max-size",
-        type=str,
-        default="0",
-        help="",
-    )
-    parser.add_argument(
-        "--dry-run",
-        action="store_true",
-        help="",
-    )
-    parser.add_argument(
-        "--no-tensor-first-split",
-        action="store_true",
-        help="",
-    )
-    parser.add_argument(
-        "--metadata",
-        type=Path,
-        help="",
     )
+    parser.add_argument("--bigendian", action="store_true")
+    parser.add_argument("model", type=Path)
+    parser.add_argument("--use-temp-file", action="store_true")
+    parser.add_argument("--no-lazy", action="store_true")
+    parser.add_argument("--model-name", type=str, default=None)
+    parser.add_argument("--verbose", action="store_true")
+    parser.add_argument("--split-max-tensors", type=int, default=0)
+    parser.add_argument("--split-max-size", type=str, default="0")
+    parser.add_argument("--dry-run", action="store_true")
+    parser.add_argument("--no-tensor-first-split", action="store_true")
+    parser.add_argument("--metadata", type=Path)

     return parser.parse_args()
diff --git a/src/convert_lora_to_gguf.py b/src/convert_lora_to_gguf.py
index afee347..2cac648 100644
--- a/src/convert_lora_to_gguf.py
+++ b/src/convert_lora_to_gguf.py
@@ -28,7 +28,6 @@
 sys.path.insert(1, str(Path(__file__).parent / "gguf-py"))
 import gguf

-# reuse model definitions from convert_hf_to_gguf.py
 from convert_hf_to_gguf import LazyTorchTensor, Model

 logger = logging.getLogger("lora-to-gguf")
@@ -40,10 +39,9 @@ class PartialLoraTensor:
     B: Tensor | None = None


-# magic to support tensor shape modifications and splitting
 class LoraTorchTensor:
-    _lora_A: Tensor  # (n_rank, row_size)
-    _lora_B: Tensor  # (col_size, n_rank)
+    _lora_A: Tensor
+    _lora_B: Tensor
     _rank: int

     def __init__(self, A: Tensor, B: Tensor):
@@ -61,20 +59,14 @@ def get_lora_A_B(self) -> tuple[Tensor, Tensor]:

     def __getitem__(
         self,
-        indices: (
-            SupportsIndex
-            | slice
-            | tuple[
-                SupportsIndex | slice | Tensor, ...
-            ]  # TODO: add ellipsis in the type signature
-        ),
+        indices: SupportsIndex | slice | tuple[SupportsIndex | slice | Tensor, ...],
     ) -> LoraTorchTensor:
         shape = self.shape
         if isinstance(indices, SupportsIndex):
             if len(shape) > 2:
                 return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
             else:
-                raise NotImplementedError  # can't return a vector
+                raise NotImplementedError
         elif isinstance(indices, slice):
             if len(shape) > 2:
                 return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
@@ -84,7 +76,7 @@ def __getitem__(
             assert len(indices) > 0
             if indices[-1] is Ellipsis:
                 return self[indices[:-1]]
-            # expand ellipsis
+
             indices = tuple(
                 u
                 for v in (
@@ -104,7 +96,6 @@ def __getitem__(
                 *(slice(None, None) for _ in range(len(indices), len(shape))),
             )

-            # TODO: make sure this is correct
             indices_A = (
                 *(
                     (
@@ -120,7 +111,7 @@ def __getitem__(
             indices_B = indices[:-1]
             return LoraTorchTensor(self._lora_A[indices_A], self._lora_B[indices_B])
         else:
-            raise NotImplementedError  # unknown indice type
+            raise NotImplementedError

     @property
     def dtype(self) -> torch.dtype:
@@ -143,9 +134,8 @@ def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor:
             new_shape = cast(tuple[int, ...], shape)
         orig_shape = self.shape
         if len(new_shape) < 2:
-            raise NotImplementedError  # can't become a vector
+            raise NotImplementedError

-        # expand -1 in the shape
         if any(dim == -1 for dim in new_shape):
             n_elems = prod(orig_shape)
             n_new_elems = prod(dim if dim != -1 else 1 for dim in new_shape)
@@ -155,7 +145,7 @@ def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor:
             )

         if new_shape[-1] != orig_shape[-1]:
-            raise NotImplementedError  # can't reshape the row size trivially
+            raise NotImplementedError

         shape_A = (*(1 for _ in new_shape[:-2]), self._rank, orig_shape[-1])
         shape_B = (*new_shape[:-1], self._rank)
@@ -174,7 +164,7 @@ def permute(self, *dims: int) -> LoraTorchTensor:
         shape = self.shape
         dims = tuple(dim - len(shape) if dim >= 0 else dim for dim in dims)
         if dims[-1] == -1:
-            # TODO: support higher dimensional A shapes bigger than 1
+
             assert all(dim == 1 for dim in self._lora_A.shape[:-2])
             return LoraTorchTensor(self._lora_A, self._lora_B.permute(*dims))
         if len(shape) == 2 and dims[-1] == -2 and dims[-2] == -1:
@@ -182,7 +172,7 @@ def permute(self, *dims: int) -> LoraTorchTensor:
                 self._lora_B.permute(*dims), self._lora_A.permute(*dims)
             )
         else:
-            # TODO: compose the above two
+
             raise NotImplementedError

     def transpose(self, dim0: int, dim1: int) -> LoraTorchTensor:
@@ -201,7 +191,7 @@ def to(self, *args, **kwargs):

     @classmethod
     def __torch_function__(cls, func: Callable, types, args=(), kwargs=None):
-        del types  # unused
+        del types

         if kwargs is None:
             kwargs = {}
@@ -245,58 +235,21 @@ def get_base_tensor_name(lora_tensor_name: str) -> str:
     return base_name


-def pyinstaller_include():
-    # PyInstaller import
-    pass
-
-
 def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(
-        description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file"
-    )
-    parser.add_argument(
-        "--outfile",
-        type=Path,
-        help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
-    )
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--outfile", type=Path)
     parser.add_argument(
         "--outtype",
         type=str,
         choices=["f32", "f16", "bf16", "q8_0", "auto"],
         default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
-    )
-    parser.add_argument(
-        "--bigendian",
-        action="store_true",
-        help="model is executed on big endian machine",
-    )
-    parser.add_argument(
-        "--no-lazy",
-        action="store_true",
-        help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)",
-    )
-    parser.add_argument(
-        "--verbose",
-        action="store_true",
-        help="increase output verbosity",
-    )
-    parser.add_argument(
-        "--dry-run",
-        action="store_true",
-        help="only print out what will be done, without writing any new files",
-    )
-    parser.add_argument(
-        "--base",
-        type=Path,
-        required=True,
-        help="directory containing base model file",
-    )
-    parser.add_argument(
-        "lora_path",
-        type=Path,
-        help="directory containing LoRA adapter file",
     )
+    parser.add_argument("--bigendian", action="store_true")
+    parser.add_argument("--no-lazy", action="store_true")
+    parser.add_argument("--verbose", action="store_true")
+    parser.add_argument("--dry-run", action="store_true")
+    parser.add_argument("--base", type=Path, required=True)
+    parser.add_argument("lora_path", type=Path)

     return parser.parse_args()
@@ -323,11 +276,11 @@ def parse_args() -> argparse.Namespace:
     if args.outfile is not None:
         fname_out = args.outfile
     else:
-        # output in the same directory as the model by default
+
         fname_out = dir_lora

     if os.path.exists(input_model):
-        # lazy import load_file only if lora is in safetensors format.
+
         from safetensors.torch import load_file

         lora_model = load_file(input_model, device="cpu")
@@ -335,7 +288,6 @@ def parse_args() -> argparse.Namespace:
         input_model = os.path.join(dir_lora, "adapter_model.bin")
         lora_model = torch.load(input_model, map_location="cpu", weights_only=True)

-    # load base model
     logger.info(f"Loading base model: {dir_base_model.name}")
     hparams = Model.load_hparams(dir_base_model)
     with torch.inference_mode():
@@ -431,6 +383,7 @@ def modify_tensors(
             dry_run=args.dry_run,
             dir_lora_model=dir_lora,
             lora_alpha=alpha,
+            is_lora=True,
         )

         logger.info("Exporting model...")