Add support for BitnetForCausalLM (new model / new datatype) #7931
This is only taking the last non-zero value of the tensor as the scale, if I understand correctly?
The other quants use the absmax, so this looks a bit weird.
Does it work as expected? If so, how or why?
Should it be the absmean of the non-zero values instead?
Thanks for the reminder!
Actually, at this point the weight matrix only contains three distinct values: (1, 0, -1) * scale (for example, 0.66, 0, -0.66). So as long as we don't pick 0 as the scale, any non-zero value can serve as the scale. If we pick 0.66 as the scale, we transform (0.66, 0, -0.66) into (1, 0, -1); if we pick -0.66, we transform (0.66, 0, -0.66) into (-1, 0, 1).
I can add a break to the loop so it picks the first non-zero value and avoids useless looping.
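For illustration, here is a minimal sketch of that idea (not the PR's exact code; the function name is made up). It returns the magnitude of the first non-zero element, which for a pre-quantized ternary tensor is the same value absmax would find:

```c
#include <math.h>
#include <stddef.h>

// Sketch: the pre-quantized weights only take the values {-scale, 0, +scale},
// so any non-zero element gives the scale. Breaking on the first one avoids
// scanning the whole tensor; fabsf() fixes the sign convention.
static float find_scale(const float * w, size_t n) {
    for (size_t i = 0; i < n; i++) {
        if (w[i] != 0.0f) {
            return fabsf(w[i]); // first non-zero value, sign stripped
        }
    }
    return 1.0f; // all-zero tensor: fall back to a neutral scale
}
```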
This would slightly complicate the eventual Numpy implementation in gguf-py/gguf/quants.py for convert-hf-to-gguf.py. Quantization doesn't need to be particularly fast (I think?), but it needs to be reproducible (and possibly sane even on non-bitnet models, since people will try to apply this to models for which it's not appropriate). If all absolute non-zero values are the same in bitnet models, picking the absmax might be fine then. (It's dequantization that needs to be fast for good inference speed.)
Got it. I will change it to absmax to make it more reproducible.
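For reference, a hedged sketch of an absmax tensor-wide scale (illustrative only, not this PR's code). On a pre-quantized ternary tensor whose non-zero entries all share the same magnitude it returns the same value as picking any non-zero element, but it stays well-defined and reproducible on arbitrary tensors:

```c
#include <math.h>
#include <stddef.h>

// Sketch: maximum absolute value over the tensor.
static float absmax_scale(const float * w, size_t n) {
    float amax = 0.0f;
    for (size_t i = 0; i < n; i++) {
        const float a = fabsf(w[i]);
        if (a > amax) {
            amax = a;
        }
    }
    return amax;
}
```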
@Eddie-Wang1120, actually (I only noticed it now), in section 2 of the BitNet 1.58b paper, they specifically say they use absmean:
See https://arxiv.org/html/2402.17764v1#S2
But if it's applied twice (e.g. on pre-quantized weights), then maybe the mean shouldn't include the zero values. (absmax is still fine, but only when the weights are pre-quantized.)
An interesting fact is that if we don't pre-quantize the weights to {-1, 0, +1} * scale, the generated tokens will be wrong. That's why I put the absmean quantization (weight pre-quantization) in convert-hf-to-gguf.py; otherwise we'd get a meaningless fp32/fp16 gguf model.
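For clarity, a minimal sketch of absmean ternary quantization as described in section 2 of the paper (the function name and epsilon value are illustrative, not taken from this PR): scale by the mean absolute value gamma, then round and clip to {-1, 0, +1}.

```c
#include <math.h>
#include <stddef.h>

// Sketch of W~ = RoundClip(W / (gamma + eps), -1, 1), gamma = mean(|W|).
static float absmean_quantize(const float * w, signed char * q, size_t n) {
    const float eps = 1e-6f;           // illustrative epsilon
    float scale = 0.0f;
    for (size_t i = 0; i < n; i++) {
        scale += fabsf(w[i]);
    }
    scale = scale / (float) n + eps;   // gamma + eps
    for (size_t i = 0; i < n; i++) {
        float v = roundf(w[i] / scale);
        if (v >  1.0f) v =  1.0f;      // clip to [-1, 1]
        if (v < -1.0f) v = -1.0f;
        q[i] = (signed char) v;
    }
    return scale;                      // kept so that W ~= q * scale
}
```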
I wonder, maybe this could be made even more compact?
Instead of fitting only 4 ternary values per byte, it would be possible to fit 5 of them (because 3⁵ = 243, which is smaller than 256).
To avoid using modulo when dequantizing, assuming multiplication by 3 is fast (it can be turned into an addition and a bit shift), maybe storing an inverted value would work.
Not sure what speed difference it would have compared to bit shifts and masks, though.
Here's an example program verifying that multiplication can be an alternative to modulo by 3. (The program, its compile-and-run command, and its output were attached as collapsed snippets.)
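A minimal sketch of the idea (this is not the original collapsed program, and it assumes one particular "inverted" encoding: the byte stores ceil(v * 256 / 243) for a value v packing five base-3 digits). Repeated multiplication by 3 then brings each digit into the top byte, so no modulo is needed; the program checks all 243 encodings and prints a message if they all round-trip.

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    // Check every packing of 5 ternary digits d0..d4 (v = d0*81 + d1*27 + d2*9 + d3*3 + d4).
    for (uint32_t v = 0; v < 243; v++) {
        // "Inverted" byte: q = ceil(v * 256 / 243), i.e. v/243 as a 0.8 fixed-point
        // fraction rounded up, so truncation never loses a digit.
        uint32_t q    = (v * 256 + 242) / 243;
        uint32_t rest = v;
        for (int i = 0; i < 5; i++) {
            q *= 3;
            uint32_t digit    = q >> 8;     // multiply + shift instead of modulo
            uint32_t expected = rest / 81;  // most-significant remaining base-3 digit
            assert(digit == expected);
            q    &= 0xFF;
            rest  = (rest % 81) * 3;
        }
    }
    printf("multiplication-based base-3 unpacking matches for all 243 values\n");
    return 0;
}
```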
That is a very good thought!
The weights can certainly be compacted even further; as you said, we can treat each weight as a ternary value and pack several of them into a byte. What makes it less efficient is that we haven't found a suitable SIMD solution for that packing. Whereas the 2-bit compaction can use _mm256_sign_epi16, the base-3 packing doesn't seem to work with those strict alignment requirements.
So I chose the 2-bit compaction to begin with, while still looking for a more efficient solution.
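For reference, this is roughly what the 2-bit path relies on; a minimal AVX2 sketch (illustrative only, not the PR's actual vec_dot):

```c
#include <immintrin.h>

// Sketch (requires AVX2): applying ternary weights {-1, 0, +1} to int16 activations
// without a multiply. _mm256_sign_epi16(a, w) negates lanes of 'a' where the matching
// lane of 'w' is negative, zeroes them where 'w' is zero, and passes them through where
// 'w' is positive -- exactly a * w for ternary w.
static inline __m256i ternary_madd(__m256i acc, __m256i a, __m256i w) {
    const __m256i prod = _mm256_sign_epi16(a, w);
    // widen pairwise to int32 and accumulate
    return _mm256_add_epi32(acc, _mm256_madd_epi16(prod, _mm256_set1_epi16(1)));
}
```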
Why is the same scale stored 8 times?
I noticed that there is an alignment restriction in gguf of 32 bytes, so I stored the float32 scale 8 times to fill the alignment. It would still work if I changed it to scale_ptr[0] = i2_scale.
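A hedged sketch of what that looks like (names are illustrative, not the PR's exact code): the tensor-wide f32 scale is appended after the packed 2-bit weights, repeated 8 times so the appended data is 8 * sizeof(float) = 32 bytes and GGUF's 32-byte alignment is preserved, even though only scale_ptr[0] is ever read back.

```c
#include <stddef.h>
#include <stdint.h>

// Sketch: dst + packed_size is assumed to be suitably aligned for float writes.
static void append_scale(uint8_t * dst, size_t packed_size, float i2_scale) {
    float * scale_ptr = (float *)(dst + packed_size);
    for (int k = 0; k < 8; k++) {
        scale_ptr[k] = i2_scale; // 8 * 4 bytes = 32 bytes of padding-with-scale
    }
}
```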
Regarding the tensor-wide scales, even though the paper suggests using them, I wonder if using block scales would work too, so that it works better with the existing ggml type infrastructure. The scales could even all be made equal if that's a problem.

When (or if) packing 5 values per byte, since the row sizes of the bitnet models are usually not multiples of 5 (e.g. 1536, 2048), and since the 3B model uses a hidden_size of 3200 which isn't a multiple of 256, using blocks of 128 elements could work: two groups of 64 elements, each group having 12 bytes with 5 elements per byte plus 1 more byte with 4 elements, so 2*(12+1) = 26 bytes, and then a scale. If the scale is an f32, that would make this 1.875 bits per weight, while an f16 or bf16 scale would make this 1.75 bits per weight. (No scale would be 1.625 bpw, which is very close to the ideal of 1.5849625 bpw.)

If packing only 4 ternary values per byte (as in i2_s), then using blocks of 128 elements with an f32 scale would make this 2.25 bits per weight, while a scale in f16 or bf16 would make this 2.125 bits per weight, which would pretty much be like Q2_0 if that existed.
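For concreteness, a hedged sketch of the 128-element, 5-per-byte layout described above (the struct and field names are made up; no such ggml type exists in this PR):

```c
#include <stdint.h>

// Hypothetical block: 128 ternary elements, two 64-element groups of
// 12 bytes x 5 elements + 1 byte x 4 elements, plus one f32 scale.
// 2*(12+1) + 4 = 30 bytes per 128 elements -> 30*8/128 = 1.875 bits per weight.
typedef struct {
    uint8_t qs[2*(12 + 1)]; // 26 bytes of base-3 packed ternary values
    float   d;              // block scale
} block_ternary_128;
```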
Well, personally, I still think the tensor-wide scale is a better fit for bitnet. At least for the 2-bit compaction, 2.25 bpw means around 10% of wasted model size, which is a bit beyond what's acceptable.
Since 2 bits is already wasting 20% of the tensor size compared to the 1.6 bpw ideal for ternary, maybe there could be a way to still make this a block-wise quant (e.g. 4 elements per block of 1 byte), and have a row-wise/tensor-wise scale by somehow encoding it in unused bits in the first few blocks of each row? Might be a bad idea though, but I don't know yet why (maybe the overhead in recovering the scale?). (This would also require asserting a minimal row size in the quantize function.)
Because 4 ternary values fit in 7 bits (3^4 == 81 < 128), and you're already using a lookup table to expand the packed bits into 4 bytes, this could let the SIMD vec_dot stay pretty much identical to how it is now, except maybe it could include the scaling in its result?
Not sure yet how to pack the scale in i8_s, though, or some other way to let ggml_vec_dot_i2_i8_s have access to its scale.
Anyway, at least this gives some ideas to try eventually.
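As a rough illustration of the "scale hidden in unused bits" idea (purely hypothetical names, not ggml code): keep 4 ternary values in the low 7 bits of every byte (3^4 = 81 < 128) and spread a 32-bit f32 row scale across the top bits of the first 32 bytes of the row.

```c
#include <stdint.h>
#include <string.h>

// Sketch: store one bit of the f32 scale in the MSB of each of the first 32 bytes.
static void pack_row_scale(uint8_t * qs /* at least 32 bytes */, float scale) {
    uint32_t bits;
    memcpy(&bits, &scale, sizeof(bits));
    for (int i = 0; i < 32; i++) {
        qs[i] = (uint8_t)((qs[i] & 0x7F) | (((bits >> i) & 1u) << 7));
    }
}

// Sketch: recover the scale from those 32 top bits.
static float unpack_row_scale(const uint8_t * qs) {
    uint32_t bits = 0;
    for (int i = 0; i < 32; i++) {
        bits |= (uint32_t)(qs[i] >> 7) << i;
    }
    float scale;
    memcpy(&scale, &bits, sizeof(scale));
    return scale;
}
```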
Got your idea. Assume we have 128 contiguous values: we can compact them into (128 / 4) * 7 = 224 bits, and the other 32 bits would be a float32 scale, so it's still 2 bpw. The block size could be 28 chars (224 bits) + 1 float32 (32 bits) == 32 bytes. One thing that worries me a little is that we'd need to do some shifting to align the weights so we can index into the lookup table; it may slow down the kernel, but it's worth a try.
Note that the unit for block sizes is elements, while type sizes are in bytes.
To keep the alignment, my suggestion was actually to keep using 8 bits per 4 elements (so that alignment remains easy), but also use the top bit of the first 16 or 32 bytes to store the scale. Only the lower (or upper? doesn't matter) 7 bits of those bytes would store the 4 elements, using the fact that 3^4 == 81 < 128 == 2^7.
To go for maximum compactness, the same idea could be applied to 5 elements per byte to achieve 1.625 bpw. The type size would be 13 bytes and the block size 64 elements: 12 bytes of 5 elements per byte and 1 byte of 4 elements, plus part of the scale. But this is more complicated, because 5 isn't a power of 2, so the SIMD vec_dot would need lots of non-trivial modifications, unlike with the other 2 bpw suggestion.
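A hedged sketch of that 1.625 bpw variant (hypothetical, not an existing ggml type; where the scale bits actually live is left open):

```c
#include <stdint.h>

// Hypothetical block: 64 ternary elements in 13 bytes.
// Twelve bytes carry 5 base-3 packed elements each and the last byte carries 4,
// so 12*5 + 4 = 64 elements in 13*8 = 104 bits -> 1.625 bits per weight;
// spare bits (and/or the last byte's unused range) would have to carry the scale.
typedef struct {
    uint8_t qs[13];
} block_ternary_64;
```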