refactor: set config into weights for quantization feature support more easily (#400)

Co-authored-by: LS <LS>
thincal authored Apr 10, 2024
1 parent 67d5357 commit 70db455
Showing 18 changed files with 42 additions and 71 deletions.
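
The loader-side change is the same across the model classes below: rather than each loader checking for a quantized method and calling weights._set_gptq_params(model_id), the loader now always hands the model config to the Weights object, which resolves the quantization parameters itself. A minimal before/after sketch of that pattern, using only names that appear in this diff:

    # before: every loader special-cased the quantized methods
    if config.quantize in ["gptq", "awq", "eetq"]:
        weights._set_gptq_params(model_id)

    # after: the config is always attached to the weights
    weights._set_config(model_id, config)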
3 changes: 1 addition & 2 deletions server/lorax_server/models/bloom.py
@@ -87,8 +87,7 @@ def __init__(
         torch.distributed.barrier(group=self.process_group)
         filenames = weight_files(model_id, revision=revision, extension=".safetensors")
         weights = Weights(filenames, device=device, dtype=dtype, process_group=self.process_group)
-        if config.quantize in ["gptq", "awq", "eetq"]:
-            weights._set_gptq_params(model_id)
+        weights._set_config(model_id, config)
 
         model = BloomForCausalLM(config, weights)
 
(additional changed file; file header not captured in this view)
@@ -63,7 +63,7 @@ def _load_multi_mqa_gptq(config, prefix: str, weights, bias: bool, head_size, nu
 
         g_idx = weights.get_tensor(f"{prefix}.c_attn.g_idx")
         g_idx = g_idx.to(device=weights.device)
-        bits, groupsize = weights._get_gptq_params()
+        bits, groupsize = weights._get_bits_and_groupsize()
 
         from lorax_server.utils.layers import HAS_EXLLAMA
 
4 changes: 1 addition & 3 deletions server/lorax_server/models/flash_gemma.py
@@ -64,9 +64,7 @@ def __init__(
             dtype,
             process_group=self.process_group,
         )
-
-        if config.quantize in ["gptq", "awq", "eetq"]:
-            weights._set_gptq_params(model_id)
+        weights._set_config(model_id, config)
 
         model = GemmaForCausalLM(config, weights)
 
4 changes: 1 addition & 3 deletions server/lorax_server/models/flash_gpt2.py
@@ -66,9 +66,7 @@ def __init__(
             dtype,
             process_group=self.process_group,
         )
-
-        if config.quantize in ["gptq", "awq", "eetq"]:
-            weights._set_gptq_params(model_id)
+        weights._set_config(model_id, config)
 
         model = FlashGPT2ForCausalLM(config, weights)
 
4 changes: 1 addition & 3 deletions server/lorax_server/models/flash_llama.py
@@ -73,9 +73,7 @@ def __init__(
             dtype,
             process_group=self.process_group,
         )
-
-        if config.quantize in ["gptq", "awq", "eetq"]:
-            weights._set_gptq_params(model_id)
+        weights._set_config(model_id, config)
 
         model = FlashLlamaForCausalLM(config, weights)
 
4 changes: 1 addition & 3 deletions server/lorax_server/models/flash_mistral.py
@@ -71,9 +71,7 @@ def __init__(
             dtype,
             process_group=self.process_group,
         )
-
-        if config.quantize in ["gptq", "awq", "eetq"]:
-            weights._set_gptq_params(model_id)
+        weights._set_config(model_id, config)
 
         model = FlashMistralForCausalLM(config, weights)
 
4 changes: 1 addition & 3 deletions server/lorax_server/models/flash_mixtral.py
@@ -71,9 +71,7 @@ def __init__(
             dtype,
             process_group=self.process_group,
         )
-
-        if config.quantize in ["gptq", "awq", "eetq"]:
-            weights._set_gptq_params(model_id)
+        weights._set_config(model_id, config)
 
         model = FlashMixtralForCausalLM(config, weights)
 
3 changes: 1 addition & 2 deletions server/lorax_server/models/flash_neox.py
@@ -49,8 +49,7 @@ def __init__(
         torch.distributed.barrier(group=self.process_group)
         filenames = weight_files(model_id, revision=revision, extension=".safetensors")
         weights = Weights(filenames, device=device, dtype=dtype, process_group=self.process_group)
-        if config.quantize in ["gptq", "awq", "eetq"]:
-            weights._set_gptq_params(model_id)
+        weights._set_config(model_id, config)
 
         model = FlashGPTNeoXForCausalLM(config, weights)
 
4 changes: 1 addition & 3 deletions server/lorax_server/models/flash_phi.py
@@ -69,9 +69,7 @@ def __init__(
             dtype,
             process_group=self.process_group,
         )
-
-        if config.quantize in ["gptq", "awq", "eetq"]:
-            weights._set_gptq_params(model_id)
+        weights._set_config(model_id, config)
 
         model = FlashPhiForCausalLM(config, weights)
         self.config = config
4 changes: 1 addition & 3 deletions server/lorax_server/models/flash_qwen.py
@@ -69,9 +69,7 @@ def __init__(
             dtype,
             process_group=self.process_group,
         )
-
-        if config.quantize in ["gptq", "awq", "eetq"]:
-            weights._set_gptq_params(model_id)
+        weights._set_config(model_id, config)
 
         model = FlashQwenForCausalLM(config, weights)
         self.config = config
4 changes: 1 addition & 3 deletions server/lorax_server/models/flash_qwen2.py
@@ -79,9 +79,7 @@ def __init__(
             dtype,
             process_group=self.process_group,
         )
-
-        if config.quantize in ["gptq", "awq", "eetq"]:
-            weights._set_gptq_params(model_id)
+        weights._set_config(model_id, config)
 
         model = FlashQwen2ForCausalLM(config, weights)
 
6 changes: 2 additions & 4 deletions server/lorax_server/models/flash_rw.py
@@ -45,6 +45,7 @@ def __init__(
         )
 
         config = RWConfig.from_pretrained(model_id, revision=revision, trust_remote_code=trust_remote_code)
+        config.quantize = quantize
 
         torch.distributed.barrier(group=self.process_group)
         filenames = weight_files(model_id, revision=revision, extension=".safetensors")
@@ -55,10 +56,7 @@ def __init__(
             process_group=self.process_group,
             aliases={"transformer.word_embeddings.weight": ["lm_head.weight"]},
         )
-
-        config.quantize = quantize
-        if config.quantize in ["gptq", "awq", "eetq"]:
-            weights._set_gptq_params(model_id)
+        weights._set_config(model_id, config)
 
         model = FlashRWForCausalLM(config, weights)
 
3 changes: 1 addition & 2 deletions server/lorax_server/models/flash_santacoder.py
@@ -60,8 +60,7 @@ def __init__(
             process_group=self.process_group,
             aliases={"transformer.wte.weight": ["lm_head.weight"]},
         )
-        if config.quantize in ["gptq", "awq", "eetq"]:
-            weights._set_gptq_params(model_id)
+        weights._set_config(model_id, config)
 
         model = FlashSantacoderForCausalLM(config, weights)
 
3 changes: 1 addition & 2 deletions server/lorax_server/models/galactica.py
@@ -189,8 +189,7 @@ def __init__(
         torch.distributed.barrier(group=self.process_group)
         filenames = weight_files(model_id, revision=revision, extension=".safetensors")
         weights = Weights(filenames, device=device, dtype=dtype, process_group=self.process_group)
-        if config.quantize in ["gptq", "awq", "eetq"]:
-            weights._set_gptq_params(model_id)
+        weights._set_config(model_id, config)
 
         model = OPTForCausalLM(config, weights)
 
3 changes: 1 addition & 2 deletions server/lorax_server/models/gpt_neox.py
@@ -58,8 +58,7 @@ def __init__(
         torch.distributed.barrier(group=self.process_group)
         filenames = weight_files(model_id, revision=revision, extension=".safetensors")
         weights = Weights(filenames, device=device, dtype=dtype, process_group=self.process_group)
-        if config.quantize in ["gptq", "awq", "eetq"]:
-            weights._set_gptq_params(model_id)
+        weights._set_config(model_id, config)
 
         model = GPTNeoxForCausalLM(config, weights)
 
3 changes: 1 addition & 2 deletions server/lorax_server/models/mpt.py
@@ -81,8 +81,7 @@ def __init__(
 
         filenames = weight_files(model_id, revision=revision, extension=".safetensors")
         weights = Weights(filenames, device, dtype, process_group=self.process_group)
-        if config.quantize in ["gptq", "awq", "eetq"]:
-            weights._set_gptq_params(model_id)
+        weights._set_config(model_id, config)
 
         config.quantize = quantize
         model = MPTForCausalLM(config, weights)
3 changes: 1 addition & 2 deletions server/lorax_server/models/opt.py
@@ -56,8 +56,7 @@ def __init__(
         torch.distributed.barrier(group=self.process_group)
         filenames = weight_files(model_id, revision=revision, extension=".safetensors")
         weights = Weights(filenames, device=device, dtype=dtype, process_group=self.process_group)
-        if config.quantize in ["gptq", "awq", "eetq"]:
-            weights._set_gptq_params(model_id)
+        weights._set_config(model_id, config)
 
         model = OPTForCausalLM(config, weights)
 
52 changes: 24 additions & 28 deletions server/lorax_server/utils/weights.py
@@ -9,7 +9,7 @@
 from huggingface_hub import hf_hub_download
 from huggingface_hub.utils import EntryNotFoundError, LocalEntryNotFoundError
 from loguru import logger
-from safetensors import SafetensorError, safe_open
+from safetensors import safe_open
 
 
 class AbstractWeights(ABC):
@@ -224,7 +224,7 @@ def get_multi_weights_col(self, prefixes: List[Union[str, Tuple]], quantize: str
             else:
                 g_idx = None
 
-            bits, groupsize = self._get_gptq_params()
+            bits, groupsize = self._get_bits_and_groupsize()
             weight = (qweight, qzeros, scales, g_idx, bits, groupsize, False)
         else:
             w = self.get_sharded_list("weight", prefixes, dim=0)
@@ -234,7 +234,7 @@ def get_multi_weights_row(self, prefix: str, quantize: str):
     def get_multi_weights_row(self, prefix: str, quantize: str):
         if quantize == "gptq":
             use_exllama = True
-            bits, groupsize = self._get_gptq_params()
+            bits, groupsize = self._get_bits_and_groupsize()
 
             if bits != 4:
                 use_exllama = False
@@ -298,7 +298,7 @@ def get_multi_weights_row(self, prefix: str, quantize: str):
 
             weight = (qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama)
         elif quantize == "awq":
-            bits, groupsize = self._get_gptq_params()
+            bits, groupsize = self._get_bits_and_groupsize()
 
             try:
                 qweight = self.get_sharded(f"{prefix}.qweight", dim=0)
@@ -314,31 +314,29 @@ def get_multi_weights_row(self, prefix: str, quantize: str):
             weight = self.get_sharded(f"{prefix}.weight", dim=1)
         return weight
 
-    def _get_gptq_params(self) -> Tuple[int, int]:
+    def _get_bits_and_groupsize(self) -> Tuple[int, int]:
         try:
-            bits = self.get_tensor("gptq_bits").item()
-            groupsize = self.get_tensor("gptq_groupsize").item()
-        except (SafetensorError, RuntimeError) as e:
+            bits = self.config.quantization_config["bits"]
+            groupsize = self.config.quantization_config["group_size"]
+        except KeyError:
+            # be compatible with old behavior for gptq
             try:
-                bits = self.gptq_bits
-                groupsize = self.gptq_groupsize
-            except Exception:
-                raise e
+                bits = self.config.quantization_config["gptq_bits"]
+                groupsize = self.config.quantization_config["gptq_groupsize"]
+            except KeyError:
+                try:
+                    bits = self.get_tensor("gptq_bits").item()
+                    groupsize = self.get_tensor("gptq_groupsize").item()
+                except Exception as e:
+                    raise e
 
         return bits, groupsize
 
-    def _set_gptq_params(self, model_id):
-        filename = "config.json"
-        try:
-            if os.path.exists(os.path.join(model_id, filename)):
-                filename = os.path.join(model_id, filename)
-            else:
-                filename = hf_hub_download(model_id, filename=filename)
-            with open(filename, "r") as f:
-                data = json.load(f)
-            self.gptq_bits = data["quantization_config"]["bits"]
-            self.gptq_groupsize = data["quantization_config"]["group_size"]
-        except Exception:
+    def _set_config(self, model_id, config):
+        self.config = config
+
+        if not hasattr(self.config, "quantization_config"):
+            # fill from other config file
             filename = "quantize_config.json"
             try:
                 if os.path.exists(os.path.join(model_id, filename)):
@@ -347,8 +345,7 @@ def _set_gptq_params(self, model_id):
                     filename = hf_hub_download(model_id, filename=filename)
                 with open(filename, "r") as f:
                     data = json.load(f)
-                self.gptq_bits = data["bits"]
-                self.gptq_groupsize = data["group_size"]
+                self.config.quantization_config = data["quantization_config"]
             except Exception:
                 filename = "quant_config.json"
                 try:
@@ -358,8 +355,7 @@ def _set_gptq_params(self, model_id):
                         filename = hf_hub_download(model_id, filename=filename)
                     with open(filename, "r") as f:
                         data = json.load(f)
-                    self.gptq_bits = data["w_bit"]
-                    self.gptq_groupsize = data["q_group_size"]
+                    self.config.quantization_config = data["quantization_config"]
                 except Exception:
                     pass
 

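For reference, the two helpers in weights.py now work together as follows; this is a usage sketch of the lookup order implied by the diff above, not additional code from the commit:

    # _set_config attaches the HF config to the Weights object; if the config has no
    # quantization_config attribute, it tries to fill one in from quantize_config.json,
    # then quant_config.json (local path first, otherwise hf_hub_download).
    weights._set_config(model_id, config)

    # _get_bits_and_groupsize reads bits/group_size from config.quantization_config,
    # falling back to the legacy gptq_bits / gptq_groupsize tensors in the checkpoint.
    bits, groupsize = weights._get_bits_and_groupsize()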