From 57b79fda88f95ef953cf90c1ec74b48ef5252b78 Mon Sep 17 00:00:00 2001 From: chentyjpm <317974925@qq.com> Date: Wed, 14 Aug 2024 14:34:47 +0800 Subject: [PATCH 1/2] add hf2gguf conv format of q4_0 q4_1 q5_0 q5_1 --- convert_hf_to_gguf.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 550dd5cfda99f..74f2f5abff9c1 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -311,6 +311,14 @@ def prepare_tensors(self): data_qtype = gguf.GGMLQuantizationType.BF16 elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0: data_qtype = gguf.GGMLQuantizationType.Q8_0 + elif self.ftype == gguf.LlamaFileType.MOSTLY_Q4_0: + data_qtype = gguf.GGMLQuantizationType.Q4_0 + elif self.ftype == gguf.LlamaFileType.MOSTLY_Q4_1: + data_qtype = gguf.GGMLQuantizationType.Q4_1 + elif self.ftype == gguf.LlamaFileType.MOSTLY_Q5_0: + data_qtype = gguf.GGMLQuantizationType.Q5_0 + elif self.ftype == gguf.LlamaFileType.MOSTLY_Q5_1: + data_qtype = gguf.GGMLQuantizationType.Q5_1 else: raise ValueError(f"Unknown file type: {self.ftype.name}") @@ -3815,8 +3823,10 @@ def parse_args() -> argparse.Namespace: help="path to write to; default: based on input. 
{ftype} will be replaced by the outtype.", ) parser.add_argument( - "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16", - help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type", + "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1", "auto"], default="f16", + help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, " + "q8_0 for Q8_0, q4_0 for Q4_0, q4_1 for Q4_1, q5_0 for Q5_0, q5_1 for Q5_1," + " auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type", ) parser.add_argument( "--bigendian", action="store_true", @@ -3903,6 +3913,10 @@ def main() -> None: "f16": gguf.LlamaFileType.MOSTLY_F16, "bf16": gguf.LlamaFileType.MOSTLY_BF16, "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, + "q4_0": gguf.LlamaFileType.MOSTLY_Q4_0, + "q4_1": gguf.LlamaFileType.MOSTLY_Q4_1, + "q5_0": gguf.LlamaFileType.MOSTLY_Q5_0, + "q5_1": gguf.LlamaFileType.MOSTLY_Q5_1, "auto": gguf.LlamaFileType.GUESSED, } From 7d261a9f96e0e2c137bf5f382ab699e0e421e70d Mon Sep 17 00:00:00 2001 From: chentyjpm <317974925@qq.com> Date: Thu, 15 Aug 2024 14:20:07 +0800 Subject: [PATCH 2/2] add warning code when quantizing to Q4_0, Q4_1, Q5_0, or Q5_1 --- convert_hf_to_gguf.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 74f2f5abff9c1..33c62312f5de6 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -322,6 +322,19 @@ def prepare_tensors(self): else: raise ValueError(f"Unknown file type: {self.ftype.name}") + if data_qtype in [ + gguf.GGMLQuantizationType.Q5_1, gguf.GGMLQuantizationType.Q5_0, + gguf.GGMLQuantizationType.Q4_1, gguf.GGMLQuantizationType.Q4_0, + ]: + logger.warning("\n") + 
logger.warning("**************************************************************************************") + logger.warning("** WARNING: quantizing to `Q4_0`, `Q4_1`, `Q5_0`, or `Q5_1`") + logger.warning("** is not equivalent to using `llama-quantize`") + logger.warning("** `llama-quantize` uses `Q4_K` and `Q6_K` for the token embeddings") + logger.warning("** but this script does not") + logger.warning("**************************************************************************************") + logger.warning("\n") + try: data = gguf.quants.quantize(data, data_qtype) except gguf.QuantError as e: @@ -3825,7 +3838,7 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1", "auto"], default="f16", help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, " - "q8_0 for Q8_0, q4_0 for Q4_0, q4_1 for Q4_1, q5_0 for Q5_0, q5_1 for Q5_1," + "q8_0 for Q8_0, limited: q4_0 for Q4_0, q4_1 for Q4_1, q5_0 for Q5_0, q5_1 for Q5_1," " auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type", ) parser.add_argument(