Skip to content

Commit

Permalink
cuda : build only necessary templates
Browse files: browse the repository at this point in the history
  • Loading branch information
ggerganov committed Aug 14, 2024
1 parent ae41fd2 commit 503983a
Show file tree
Hide file tree
Showing 102 changed files with 60 additions and 567 deletions.
12 changes: 0 additions & 12 deletions ggml/src/ggml-cuda/fattn.cu
Original file line number Diff line number Diff line change
Expand Up @@ -208,13 +208,7 @@ static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, gg

FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)
#else
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)

FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)

FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16)
FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)
FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)
#endif // GGML_CUDA_FA_ALL_QUANTS

on_no_fattn_vec_case(Q->ne[0]);
Expand Down Expand Up @@ -283,13 +277,7 @@ static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, gg

FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)
#else
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)

FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)

FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16)
FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)
FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)
#endif // GGML_CUDA_FA_ALL_QUANTS

on_no_fattn_vec_case(Q->ne[0]);
Expand Down
90 changes: 45 additions & 45 deletions ggml/src/ggml-cuda/mmq.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,60 +29,60 @@ void ggml_cuda_op_mul_mat_q(
const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst};

switch (src0->type) {
case GGML_TYPE_Q4_0:
mul_mat_q_case<GGML_TYPE_Q4_0>(ctx, args, stream);
break;
case GGML_TYPE_Q4_1:
mul_mat_q_case<GGML_TYPE_Q4_1>(ctx, args, stream);
break;
case GGML_TYPE_Q5_0:
mul_mat_q_case<GGML_TYPE_Q5_0>(ctx, args, stream);
break;
case GGML_TYPE_Q5_1:
mul_mat_q_case<GGML_TYPE_Q5_1>(ctx, args, stream);
break;
// case GGML_TYPE_Q4_0:
// mul_mat_q_case<GGML_TYPE_Q4_0>(ctx, args, stream);
// break;
// case GGML_TYPE_Q4_1:
// mul_mat_q_case<GGML_TYPE_Q4_1>(ctx, args, stream);
// break;
// case GGML_TYPE_Q5_0:
// mul_mat_q_case<GGML_TYPE_Q5_0>(ctx, args, stream);
// break;
// case GGML_TYPE_Q5_1:
// mul_mat_q_case<GGML_TYPE_Q5_1>(ctx, args, stream);
// break;
case GGML_TYPE_Q8_0:
mul_mat_q_case<GGML_TYPE_Q8_0>(ctx, args, stream);
break;
case GGML_TYPE_Q2_K:
mul_mat_q_case<GGML_TYPE_Q2_K>(ctx, args, stream);
break;
case GGML_TYPE_Q3_K:
mul_mat_q_case<GGML_TYPE_Q3_K>(ctx, args, stream);
break;
// case GGML_TYPE_Q2_K:
// mul_mat_q_case<GGML_TYPE_Q2_K>(ctx, args, stream);
// break;
// case GGML_TYPE_Q3_K:
// mul_mat_q_case<GGML_TYPE_Q3_K>(ctx, args, stream);
// break;
case GGML_TYPE_Q4_K:
mul_mat_q_case<GGML_TYPE_Q4_K>(ctx, args, stream);
break;
case GGML_TYPE_Q5_K:
mul_mat_q_case<GGML_TYPE_Q5_K>(ctx, args, stream);
break;
// case GGML_TYPE_Q5_K:
// mul_mat_q_case<GGML_TYPE_Q5_K>(ctx, args, stream);
// break;
case GGML_TYPE_Q6_K:
mul_mat_q_case<GGML_TYPE_Q6_K>(ctx, args, stream);
break;
case GGML_TYPE_IQ2_XXS:
mul_mat_q_case<GGML_TYPE_IQ2_XXS>(ctx, args, stream);
break;
case GGML_TYPE_IQ2_XS:
mul_mat_q_case<GGML_TYPE_IQ2_XS>(ctx, args, stream);
break;
case GGML_TYPE_IQ2_S:
mul_mat_q_case<GGML_TYPE_IQ2_S>(ctx, args, stream);
break;
case GGML_TYPE_IQ3_XXS:
mul_mat_q_case<GGML_TYPE_IQ3_XXS>(ctx, args, stream);
break;
case GGML_TYPE_IQ3_S:
mul_mat_q_case<GGML_TYPE_IQ3_S>(ctx, args, stream);
break;
case GGML_TYPE_IQ1_S:
mul_mat_q_case<GGML_TYPE_IQ1_S>(ctx, args, stream);
break;
case GGML_TYPE_IQ4_XS:
mul_mat_q_case<GGML_TYPE_IQ4_XS>(ctx, args, stream);
break;
case GGML_TYPE_IQ4_NL:
mul_mat_q_case<GGML_TYPE_IQ4_NL>(ctx, args, stream);
break;
// case GGML_TYPE_IQ2_XXS:
// mul_mat_q_case<GGML_TYPE_IQ2_XXS>(ctx, args, stream);
// break;
// case GGML_TYPE_IQ2_XS:
// mul_mat_q_case<GGML_TYPE_IQ2_XS>(ctx, args, stream);
// break;
// case GGML_TYPE_IQ2_S:
// mul_mat_q_case<GGML_TYPE_IQ2_S>(ctx, args, stream);
// break;
// case GGML_TYPE_IQ3_XXS:
// mul_mat_q_case<GGML_TYPE_IQ3_XXS>(ctx, args, stream);
// break;
// case GGML_TYPE_IQ3_S:
// mul_mat_q_case<GGML_TYPE_IQ3_S>(ctx, args, stream);
// break;
// case GGML_TYPE_IQ1_S:
// mul_mat_q_case<GGML_TYPE_IQ1_S>(ctx, args, stream);
// break;
// case GGML_TYPE_IQ4_XS:
// mul_mat_q_case<GGML_TYPE_IQ4_XS>(ctx, args, stream);
// break;
// case GGML_TYPE_IQ4_NL:
// mul_mat_q_case<GGML_TYPE_IQ4_NL>(ctx, args, stream);
// break;
default:
GGML_ABORT("fatal error");
break;
Expand Down
30 changes: 15 additions & 15 deletions ggml/src/ggml-cuda/mmq.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -2906,24 +2906,24 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
// DECL_MMQ_CASE(type) expands to an explicit template instantiation of
// mul_mat_q_case<type> with the (ctx, args, stream) signature spelled out below.
// The expansion deliberately carries no trailing semicolon: each use site
// supplies it, and may prefix the macro with `extern` to turn the expansion
// into an explicit instantiation *declaration* rather than a definition.
// NOTE(review): presumably the matching non-extern instantiations live in
// separate per-type translation units — confirm against the build files.
// The final line continuation below splices the following (blank) line into
// the macro body, so that blank line must remain in place.
#define DECL_MMQ_CASE(type) \
template void mul_mat_q_case<type>(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) \

extern DECL_MMQ_CASE(GGML_TYPE_Q4_0);
extern DECL_MMQ_CASE(GGML_TYPE_Q4_1);
extern DECL_MMQ_CASE(GGML_TYPE_Q5_0);
extern DECL_MMQ_CASE(GGML_TYPE_Q5_1);
//extern DECL_MMQ_CASE(GGML_TYPE_Q4_0);
//extern DECL_MMQ_CASE(GGML_TYPE_Q4_1);
//extern DECL_MMQ_CASE(GGML_TYPE_Q5_0);
//extern DECL_MMQ_CASE(GGML_TYPE_Q5_1);
extern DECL_MMQ_CASE(GGML_TYPE_Q8_0);
extern DECL_MMQ_CASE(GGML_TYPE_Q2_K);
extern DECL_MMQ_CASE(GGML_TYPE_Q3_K);
//extern DECL_MMQ_CASE(GGML_TYPE_Q2_K);
//extern DECL_MMQ_CASE(GGML_TYPE_Q3_K);
extern DECL_MMQ_CASE(GGML_TYPE_Q4_K);
extern DECL_MMQ_CASE(GGML_TYPE_Q5_K);
//extern DECL_MMQ_CASE(GGML_TYPE_Q5_K);
extern DECL_MMQ_CASE(GGML_TYPE_Q6_K);
extern DECL_MMQ_CASE(GGML_TYPE_IQ2_XXS);
extern DECL_MMQ_CASE(GGML_TYPE_IQ2_XS);
extern DECL_MMQ_CASE(GGML_TYPE_IQ2_S);
extern DECL_MMQ_CASE(GGML_TYPE_IQ3_XXS);
extern DECL_MMQ_CASE(GGML_TYPE_IQ3_S);
extern DECL_MMQ_CASE(GGML_TYPE_IQ1_S);
extern DECL_MMQ_CASE(GGML_TYPE_IQ4_NL);
extern DECL_MMQ_CASE(GGML_TYPE_IQ4_XS);
//extern DECL_MMQ_CASE(GGML_TYPE_IQ2_XXS);
//extern DECL_MMQ_CASE(GGML_TYPE_IQ2_XS);
//extern DECL_MMQ_CASE(GGML_TYPE_IQ2_S);
//extern DECL_MMQ_CASE(GGML_TYPE_IQ3_XXS);
//extern DECL_MMQ_CASE(GGML_TYPE_IQ3_S);
//extern DECL_MMQ_CASE(GGML_TYPE_IQ1_S);
//extern DECL_MMQ_CASE(GGML_TYPE_IQ4_NL);
//extern DECL_MMQ_CASE(GGML_TYPE_IQ4_XS);

// -------------------------------------------------------------------------------------------------------------------------

Expand Down

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

Loading

0 comments on commit 503983a

Please sign in to comment.