renmae

ggerganov · Nov 21, 2023 · 42cd49c · 42cd49c
1 parent 05ac2ac
commit 42cd49c
Show file tree

Hide file tree

Showing 9 changed files with 384 additions and 171 deletions.
diff --git a/Makefile b/Makefile
@@ -116,7 +116,7 @@ endif
 # keep standard at C11 and C++11
 MK_CPPFLAGS = -I. -Icommon
 MK_CFLAGS   = -std=c11   -fPIC
-MK_CXXFLAGS = -std=c++11 -fPIC
+MK_CXXFLAGS = -std=c++17 -fPIC -fpermissive
 
 # -Ofast tends to produce faster code, but may not be available for some compilers.
 ifdef LLAMA_FAST
@@ -538,16 +538,16 @@ $(info )
 #
 
 ggml.o: ggml.c ggml.h ggml-cuda.h
-	$(CC)  $(CFLAGS)   -c $< -o $@
+	$(CXX)  $(CXXFLAGS)   -c $< -o $@
 
 ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
-	$(CC)  $(CFLAGS)   -c $< -o $@
+	$(CXX)  $(CXXFLAGS)   -c $< -o $@
 
 ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
-	$(CC)  $(CFLAGS)   -c $< -o $@
+	$(CXX)  $(CXXFLAGS)   -c $< -o $@
 
 ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
-	$(CC) $(CFLAGS)    -c $< -o $@
+	$(CXX) $(CXXFLAGS)    -c $< -o $@
 
 OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
 

diff --git a/ggml-alloc.c → ggml-alloc.cpp b/ggml-alloc.c → ggml-alloc.cpp
diff --git a/ggml-backend.c → ggml-backend.cpp b/ggml-backend.c → ggml-backend.cpp
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
@@ -7623,12 +7623,12 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
 #endif
 
     // debug helpers
-    printf("JSON { \"data\":{ \"src0\": { \"%s\" :{ \"ne\" : [ %8d, %8d, %8d, %8d ],	   \"nb\" : [ %8d, %8d, %8d, %8d ], \"contiguous\":\"%d\", \"transposed\":\"%d\", \"type\": \"%s\", \"name\" : \"%s\"}}, \"src1\": { \"%s\" :{ \"ne\" : [ %8d, %8d, %8d, %8d ],	   \"nb\" : [ %8d, %8d, %8d, %8d ], \"contiguous\":\"%d\", \"transposed\":\"%d\", \"type\": \"%s\", \"name\" : \"%s\"}}, \"dst\" : { \"%s\" :{ \"ne\" : [ %8d, %8d, %8d, %8d ],	   \"nb\" : [ %8d, %8d, %8d, %8d ], \"contiguous\":\"%d\", \"transposed\":\"%d\", \"type\": \"%s\", \"name\" : \"%s\"}}}}\n",
-	   src0->name, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
-	   ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name,
-	   src1->name, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3], ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name,
-	   dst->name, dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], ggml_is_contiguous(dst), ggml_is_transposed(dst), ggml_type_name(dst->type), dst->name
-	   );
+           // printf("JSON: { \"data\":{ \"src0\": { \"%s\" :{ \"ne\" : [ %8d, %8d, %8d, %8d ],	   \"nb\" : [ %8d, %8d, %8d, %8d ], \"contiguous\":\"%d\", \"transposed\":\"%d\", \"type\": \"%s\", \"name\" : \"%s\"}}, \"src1\": { \"%s\" :{ \"ne\" : [ %8d, %8d, %8d, %8d ],	   \"nb\" : [ %8d, %8d, %8d, %8d ], \"contiguous\":\"%d\", \"transposed\":\"%d\", \"type\": \"%s\", \"name\" : \"%s\"}}, \"dst\" : { \"%s\" :{ \"ne\" : [ %8d, %8d, %8d, %8d ],	   \"nb\" : [ %8d, %8d, %8d, %8d ], \"contiguous\":\"%d\", \"transposed\":\"%d\", \"type\": \"%s\", \"name\" : \"%s\"}}}}\n",
+	   // src0->name, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
+	   // ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name,
+	   // src1->name, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3], ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name,
+	   // dst->name, dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], ggml_is_contiguous(dst), ggml_is_transposed(dst), ggml_type_name(dst->type), dst->name
+	   // );
 
     if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         // KQ single-batch

diff --git a/ggml-impl.h b/ggml-impl.h
@@ -22,7 +22,7 @@ extern "C" {
 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
 #define static_assert(cond, msg) _Static_assert(cond, msg)
 #else
-#define static_assert(cond, msg) struct global_scope_noop_trick
+  //#define static_assert(cond, msg) struct global_scope_noop_trick
 #endif
 #endif
 

diff --git a/ggml-mpi.c → ggml-mpi.cpp b/ggml-mpi.c → ggml-mpi.cpp
diff --git a/ggml-quants.c → ggml-quants.cpp b/ggml-quants.c → ggml-quants.cpp
diff --git a/ggml-quants.h b/ggml-quants.h
@@ -167,58 +167,58 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_
 
 
 // Quantization
-void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
-void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k);
-void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k);
-void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k);
-void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k);
-void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k);
-
-void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
-void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
-void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
-void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
-void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
-void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
-
-void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
-void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
-void quantize_row_q5_0(const float * restrict x, void * restrict y, int k);
-void quantize_row_q5_1(const float * restrict x, void * restrict y, int k);
-void quantize_row_q8_0(const float * restrict x, void * restrict y, int k);
-void quantize_row_q8_1(const float * restrict x, void * restrict y, int k);
-
-void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
+void quantize_row_q4_0_reference(const float * __restrict__ x, block_q4_0 * __restrict__ y, int k);
+void quantize_row_q4_1_reference(const float * __restrict__ x, block_q4_1 * __restrict__ y, int k);
+void quantize_row_q5_0_reference(const float * __restrict__ x, block_q5_0 * __restrict__ y, int k);
+void quantize_row_q5_1_reference(const float * __restrict__ x, block_q5_1 * __restrict__ y, int k);
+void quantize_row_q8_0_reference(const float * __restrict__ x, block_q8_0 * __restrict__ y, int k);
+void quantize_row_q8_1_reference(const float * __restrict__ x, block_q8_1 * __restrict__ y, int k);
+
+void quantize_row_q2_K_reference(const float * __restrict__ x, block_q2_K * __restrict__ y, int k);
+void quantize_row_q3_K_reference(const float * __restrict__ x, block_q3_K * __restrict__ y, int k);
+void quantize_row_q4_K_reference(const float * __restrict__ x, block_q4_K * __restrict__ y, int k);
+void quantize_row_q5_K_reference(const float * __restrict__ x, block_q5_K * __restrict__ y, int k);
+void quantize_row_q6_K_reference(const float * __restrict__ x, block_q6_K * __restrict__ y, int k);
+void quantize_row_q8_K_reference(const float * __restrict__ x, block_q8_K * __restrict__ y, int k);
+
+void quantize_row_q4_0(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q4_1(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q5_0(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q5_1(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q8_0(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q8_1(const float * __restrict__ x, void * __restrict__ y, int k);
+
+void quantize_row_q2_K(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q3_K(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q4_K(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q5_K(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q6_K(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q8_K(const float * __restrict__ x, void * __restrict__ y, int k);
 
 // Dequantization
-void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
-void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k);
-void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k);
-void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k);
-void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k);
-//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k);
-
-void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
-void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
-void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
-void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
-void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
-void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
+void dequantize_row_q4_0(const block_q4_0 * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q4_1(const block_q4_1 * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q5_0(const block_q5_0 * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q5_1(const block_q5_1 * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q8_0(const block_q8_0 * __restrict__ x, float * __restrict__ y, int k);
+//void dequantize_row_q8_1(const block_q8_1 * __restrict__ x, float * __restrict__ y, int k);
+
+void dequantize_row_q2_K(const block_q2_K * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q3_K(const block_q3_K * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q4_K(const block_q4_K * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q5_K(const block_q5_K * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q6_K(const block_q6_K * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q8_K(const block_q8_K * __restrict__ x, float * __restrict__ y, int k);
 
 // Dot product
-void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-
-void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q4_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q4_1_q8_1(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q5_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q5_1_q8_1(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q8_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+
+void ggml_vec_dot_q2_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q3_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q4_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q5_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q6_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);