Revert "Adding Q6_0 (#77)"
This reverts commit 1749e210d697d03f8a0d45e59257afa74b52f7f9.
Nexesenex committed Oct 21, 2024
1 parent 3d8f5eb commit 8db50dc
Showing 18 changed files with 4 additions and 9,193 deletions.
3 changes: 0 additions & 3 deletions common/common.cpp
@@ -1036,9 +1036,6 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
if (s == "q5_1") {
return GGML_TYPE_Q5_1;
}
if (s == "q6_0") {
return GGML_TYPE_Q6_0;
}

throw std::runtime_error("Invalid cache type: " + s);
}
1 change: 0 additions & 1 deletion examples/quantize/quantize.cpp
@@ -21,7 +21,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", },
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 5.21G, +0.1316 ppl @ Llama-3-8B", },
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 5.65G, +0.1062 ppl @ Llama-3-8B", },
{ "Q6_0", LLAMA_FTYPE_MOSTLY_Q6_0, " 6.5 bpw quantization", },
{ "IQ2_XXS", LLAMA_FTYPE_MOSTLY_IQ2_XXS, " 2.06 bpw quantization", },
{ "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", },
{ "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", },
4 changes: 0 additions & 4 deletions ggml/include/ggml.h
@@ -397,8 +397,6 @@ extern "C" {
GGML_TYPE_Q4_0_8_8 = 33,
GGML_TYPE_TQ1_0 = 34,
GGML_TYPE_TQ2_0 = 35,
//
GGML_TYPE_Q6_0 = 133,
GGML_TYPE_COUNT,
};

@@ -443,8 +441,6 @@ extern "C" {
GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
//
GGML_FTYPE_MOSTLY_Q6_0 = 127, // except 1d tensors
};

// available tensor operations:
11 changes: 0 additions & 11 deletions ggml/src/ggml-common.h
@@ -88,9 +88,6 @@ typedef sycl::half2 ggml_half2;
#define QI5_1 (QK5_1 / (4 * QR5_1))
#define QR5_1 2

#define QI6_0 (QK6_0 / (4 * QR6_0))
#define QR6_0 2

#define QI8_0 (QK8_0 / (4 * QR8_0))
#define QR8_0 1

@@ -186,14 +183,6 @@ typedef struct {
} block_q5_1;
static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_half) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");

#define QK6_0 32
typedef struct {
ggml_half d; // delta
uint8_t qh[QK6_0/4]; // 5+6-th bit of quants
uint8_t qs[QK6_0/2]; // nibbles / quants
} block_q6_0;
static_assert(sizeof(block_q6_0) == sizeof(ggml_half) + QK6_0/2 + QK6_0/4, "wrong q6_0 block size/padding");

#define QK8_0 32
typedef struct {
ggml_half d; // delta
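For orientation (not part of this commit): in the block_q6_0 layout removed from ggml-common.h above, a block of 32 weights stores one fp16 scale d, the low 4 bits of every quant in qs (two per byte), and the remaining 2 high bits in qh (four per byte); a weight is reconstructed as d * (q - 32) with q in [0, 63]. A minimal host-side sketch of that decode, mirroring the CUDA dequantize kernel further down in this diff (illustrative names only; the scale is held as float rather than ggml_half to keep the sketch self-contained):

#include <cstdint>

// Illustrative stand-in for the removed block_q6_0 (the real struct stores d as ggml_half).
struct block_q6_0_ref {
    float   d;           // per-block scale ("delta")
    uint8_t qh[32 / 4];  // high 2 bits of each quant, four quants per byte
    uint8_t qs[32 / 2];  // low 4 bits of each quant, two quants per byte
};

// Decode one 32-weight block: y[j] = d * (q[j] - 32).
static void dequantize_q6_0_ref(const block_q6_0_ref & x, float y[32]) {
    for (int j = 0; j < 16; ++j) {
        // qh[j % 8] holds the high-bit pairs for quants j and j + 16;
        // the upper nibble of that byte is used once j reaches 8.
        const uint8_t h    = x.qh[j % 8] >> (4 * (j / 8));
        const int     q_lo = (x.qs[j] & 0x0F) | ((h << 4) & 0x30);  // quant j
        const int     q_hi = (x.qs[j] >>   4) | ((h << 2) & 0x30);  // quant j + 16
        y[j]      = x.d * (q_lo - 32);
        y[j + 16] = x.d * (q_hi - 32);
    }
}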
4 changes: 0 additions & 4 deletions ggml/src/ggml-cuda.cu
@@ -3002,7 +3002,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q6_0:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
@@ -3074,9 +3073,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q5_1) {
return true;
}
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q6_0) {
return true;
}
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) {
return true;
}
7 changes: 0 additions & 7 deletions ggml/src/ggml-cuda/common.cuh
@@ -387,13 +387,6 @@ struct ggml_cuda_type_traits<GGML_TYPE_Q5_1> {
static constexpr int qi = QI5_1;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q6_0> {
static constexpr int qk = QK6_0;
static constexpr int qr = QR6_0;
static constexpr int qi = QI6_0;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q8_0> {
static constexpr int qk = QK8_0;
41 changes: 0 additions & 41 deletions ggml/src/ggml-cuda/convert.cu
@@ -122,36 +122,6 @@ static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t
}
}

template<typename dst_t>
static __global__ void dequantize_block_q6_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {

const int64_t i = blockIdx.x;

// assume 32 threads
const int64_t tid = threadIdx.x;
const int64_t il = tid/8;
const int64_t ir = tid%8;
const int64_t ib = 8*i + ir;
if (ib >= nb32) {
return;
}

dst_t * y = yy + 256*i + 32*ir + 4*il;

const block_q6_0 * x = (const block_q6_0 *)vx + ib;
const float d = __half2float(x->d);
const float dm = -32*d;

const uint8_t * qs = x->qs + 4*il;
const uint8_t * qh = x->qh + 4*(il%2);

for (int l = 0; l < 4; ++l) {
const uint8_t h = qh[l] >> 4*(il/2);
y[l+ 0] = d * ((qs[l] & 0xF) | ((h << 4) & 0x30)) + dm;
y[l+16] = d * ((qs[l] >> 4) | ((h << 2) & 0x30)) + dm;
}
}

//================================== k-quants

template<typename dst_t>
@@ -527,13 +497,6 @@ static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int64_t k
dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
}

template<typename dst_t>
static void dequantize_row_q6_0_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
const int nb32 = k / 32;
const int nb = (k + 255) / 256;
dequantize_block_q6_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
}

template<typename dst_t>
static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
const int nb = k / QK_K;
@@ -635,8 +598,6 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
case GGML_TYPE_Q5_1:
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
case GGML_TYPE_Q6_0:
return dequantize_row_q6_0_cuda;
case GGML_TYPE_Q8_0:
if (ggml_cuda_info().devices[ggml_cuda_get_device()].cc >= CC_PASCAL) {
return dequantize_block_q8_0_f16_cuda;
@@ -687,8 +648,6 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
case GGML_TYPE_Q5_1:
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
case GGML_TYPE_Q6_0:
return dequantize_row_q6_0_cuda;
case GGML_TYPE_Q8_0:
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
case GGML_TYPE_Q2_K:
50 changes: 0 additions & 50 deletions ggml/src/ggml-cuda/cpy.cu
@@ -225,41 +225,6 @@ static __device__ void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) {
memcpy(dsti->qh, &qh, sizeof(qh));
}

static __device__ void cpy_blck_f32_q6_0(const char * cxi, char * cdsti) {
const float * xi = (const float *) cxi;
block_q6_0 * dsti = (block_q6_0 *) cdsti;

float amax = 0.0f;
float vmax = 0.0f;

for (int j = 0; j < QK6_0; ++j) {
const float v = xi[j];
const float av = fabsf(xi[j]);
if (amax < av) {
amax = av;
vmax = v;
}
}

const float d = vmax / -32;
const float id = d ? 1.0f/d : 0.0f;

dsti->d = d;
memset(dsti->qh, 0, QK6_0/4);

for (int j = 0; j < QK6_0/2; ++j) {
const float x0 = xi[0 + j]*id;
const float x1 = xi[QK4_0/2 + j]*id;

const uint8_t xi0 = min(63, (int8_t)(x0 + 32.5f));
const uint8_t xi1 = min(63, (int8_t)(x1 + 32.5f));

dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
const uint8_t h = (xi0 >> 4) | ((xi1 >> 4) << 2);
dsti->qh[j%(QK6_0/4)] |= (h << 4*(j/(QK6_0/4)));
}
}

static __device__ const int8_t iq4nl_index[241] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 17, 17, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 18, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
@@ -462,17 +427,6 @@ static void ggml_cpy_f32_q5_1_cuda(
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_q6_0_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

GGML_ASSERT(ne % QK6_0 == 0);
const int num_blocks = ne / QK6_0;
cpy_f32_q<cpy_blck_f32_q6_0, QK6_0><<<num_blocks, 1, 0, stream>>>
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_iq4_nl_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
@@ -545,8 +499,6 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q6_0) {
ggml_cpy_f32_q6_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
@@ -587,8 +539,6 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
return (void*) cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL>;
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
return (void*) cpy_f32_q<cpy_blck_f32_q5_1, QK5_1>;
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q6_0) {
return (void*) cpy_f32_q<cpy_blck_f32_q6_0, QK6_0>;
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
return (void*) cpy_f32_f16<cpy_1_f32_f16>;
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
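For the encode direction, the reverted cpy_blck_f32_q6_0 above finds the value of largest magnitude in the block, sets d = vmax / -32, and rounds each weight to a 6-bit quant around an offset of 32. A plain C++ sketch of the same scheme, reusing the illustrative block_q6_0_ref from the earlier sketch (assumption: the device code additionally converts d to ggml_half, which is omitted here):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>

static void quantize_q6_0_ref(const float x[32], block_q6_0_ref & dst) {
    // The element with the largest magnitude fixes the scale and its sign.
    float amax = 0.0f, vmax = 0.0f;
    for (int j = 0; j < 32; ++j) {
        if (std::fabs(x[j]) > amax) { amax = std::fabs(x[j]); vmax = x[j]; }
    }

    const float d  = vmax / -32.0f;        // vmax maps onto one end of the 6-bit range
    const float id = d ? 1.0f / d : 0.0f;  // inverse scale (0 for an all-zero block)

    dst.d = d;
    std::memset(dst.qh, 0, sizeof(dst.qh));

    for (int j = 0; j < 16; ++j) {
        const int q0 = std::min(63, (int)(x[j]      * id + 32.5f));  // quant j
        const int q1 = std::min(63, (int)(x[j + 16] * id + 32.5f));  // quant j + 16
        dst.qs[j] = (uint8_t)((q0 & 0x0F) | ((q1 & 0x0F) << 4));     // low nibbles
        const uint8_t h = (uint8_t)((q0 >> 4) | ((q1 >> 4) << 2));   // high-bit pairs
        dst.qh[j % 8] |= (uint8_t)(h << (4 * (j / 8)));
    }
}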
12 changes: 0 additions & 12 deletions ggml/src/ggml-cuda/mmvq.cu
@@ -8,7 +8,6 @@ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type)
type == GGML_TYPE_Q4_1 ? vec_dot_q4_1_q8_1 :
type == GGML_TYPE_Q5_0 ? vec_dot_q5_0_q8_1 :
type == GGML_TYPE_Q5_1 ? vec_dot_q5_1_q8_1 :
type == GGML_TYPE_Q6_0 ? vec_dot_q6_0_q8_1 :
type == GGML_TYPE_Q8_0 ? vec_dot_q8_0_q8_1 :
type == GGML_TYPE_Q2_K ? vec_dot_q2_K_q8_1 :
type == GGML_TYPE_Q3_K ? vec_dot_q3_K_q8_1 :
@@ -32,7 +31,6 @@ static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
type == GGML_TYPE_Q4_1 ? VDR_Q4_1_Q8_1_MMVQ :
type == GGML_TYPE_Q5_0 ? VDR_Q5_0_Q8_1_MMVQ :
type == GGML_TYPE_Q5_1 ? VDR_Q5_1_Q8_1_MMVQ :
type == GGML_TYPE_Q6_0 ? VDR_Q6_0_Q8_1_MMVQ :
type == GGML_TYPE_Q8_0 ? VDR_Q8_0_Q8_1_MMVQ :
type == GGML_TYPE_Q2_K ? VDR_Q2_K_Q8_1_MMVQ :
type == GGML_TYPE_Q3_K ? VDR_Q3_K_Q8_1_MMVQ :
@@ -231,13 +229,6 @@ static void mul_mat_vec_q5_1_q8_1_cuda(
mul_mat_vec_q_cuda<GGML_TYPE_Q5_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_q6_0_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

mul_mat_vec_q_cuda<GGML_TYPE_Q6_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}

static void mul_mat_vec_q8_0_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
@@ -376,9 +367,6 @@ void ggml_cuda_op_mul_mat_vec_q(
case GGML_TYPE_Q5_1:
mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break;
case GGML_TYPE_Q6_0:
mul_mat_vec_q6_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break;
case GGML_TYPE_Q8_0:
mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break;
44 changes: 0 additions & 44 deletions ggml/src/ggml-cuda/vecdotq.cuh
@@ -41,30 +41,6 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
}

#define VDR_Q6_0_Q8_1_MMVQ 2
#define VDR_Q6_0_Q8_1_MMQ 4

template <int vdr> static __device__ __forceinline__ float vec_dot_q6_0_q8_1_impl(
const int * vl, const int * vh, const int * u, const float & d6, const half2 & ds8) {

int sumi = 0;

#pragma unroll
for (int i = 0; i < vdr; ++i) {
const int vi0 = ((vl[i] >> 0) & 0x0F0F0F0F) | ((vh[i] << 4) & 0x30303030);
const int vi1 = ((vl[i] >> 4) & 0x0F0F0F0F) | ((vh[i] << 2) & 0x30303030);

// SIMD dot product of quantized values
sumi = ggml_cuda_dp4a(vi0, u[2*i+0], sumi);
sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi);
}

const float2 ds8f = __half22float2(ds8);

// second part effectively subtracts 8 from each quant value
return d6 * (sumi * ds8f.x - (32.f*vdr/QI6_0) * ds8f.y);
}

#define VDR_Q4_1_Q8_1_MMVQ 2
#define VDR_Q4_1_Q8_1_MMQ 4

@@ -566,26 +542,6 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
}

static __device__ __forceinline__ float vec_dot_q6_0_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {

const block_q6_0 * bq6_0 = (const block_q6_0 *) vbq + kbx;

int vl[VDR_Q6_0_Q8_1_MMVQ];
int vh[VDR_Q6_0_Q8_1_MMVQ];
int u[2*VDR_Q6_0_Q8_1_MMVQ];

#pragma unroll
for (int i = 0; i < VDR_Q6_0_Q8_1_MMVQ; ++i) {
vl[i] = get_int_b2(bq6_0->qs, iqs + i);
vh[i] = get_int_b2(bq6_0->qh, i) >> 4*(iqs/2);
u[2*i+0] = get_int_b4(bq8_1->qs, iqs + i);
u[2*i+1] = get_int_b4(bq8_1->qs, iqs + i + QI6_0);
}

return vec_dot_q6_0_q8_1_impl<VDR_Q6_0_Q8_1_MMVQ>(vl, vh, u, bq6_0->d, bq8_1->ds);
}


static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
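In the vecdotq.cuh code removed above, the MMVQ path keeps four quants per 32-bit register lane and feeds them to ggml_cuda_dp4a. A hedged illustration of just the bit manipulation (hypothetical helper, not code from the commit): one 32-bit word of qs plus the matching word of qh yields two groups of four 6-bit quants, one quant per byte lane.

#include <cstdint>

// Rebuild the byte lanes that vec_dot_q6_0_q8_1_impl passes to dp4a: each output
// byte holds an unsigned 6-bit quant (low nibble from qs, high 2 bits from qh).
static void unpack_q6_lanes(uint32_t qs_word, uint32_t qh_word,
                            uint32_t & lo_lanes, uint32_t & hi_lanes) {
    lo_lanes = ((qs_word >> 0) & 0x0F0F0F0Fu) | ((qh_word << 4) & 0x30303030u);  // quants j .. j+3
    hi_lanes = ((qs_word >> 4) & 0x0F0F0F0Fu) | ((qh_word << 2) & 0x30303030u);  // quants j+16 .. j+19
}

Because each lane still carries the unsigned quant in [0, 63], the kernel does not subtract 32 per value; the final "- (32.f*vdr/QI6_0) * ds8f.y" term folds the offset back in analytically, using the second component of the Q8_1 block's ds field rather than unpacking signed values.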
