From 4ee1e398273d63d5a6a9554d89eeabb784568f36 Mon Sep 17 00:00:00 2001 From: Justine Tunney Date: Tue, 7 May 2024 22:32:28 -0700 Subject: [PATCH] Release llamafile v0.8.2 - Upgrade to cosmocc 3.3.6 - Remove warnings from cuda build - Fix bug in llamafile_trapping_enabled - Refactor the new vectorized expf() code - iqk_mul_mat() only needs codegen for AVX2 - Be less gung ho about the -ngl flag in README - Restore shell scriptabiilty fix for new tokenizer - Suppress divide by zero errors llama_print_timings() - Cut back on tinyBLAS CPU multiple output type kernels - Cut back NVIDIA fat binary releases to -arch=all-major - Remove GA (won't rely on slow broken irregular cloud dev tools) - Cut flash_attn_ext from release binaries (use --recompile to have it) --- .github/workflows/ci.yml | 42 ---- README.md | 12 +- build/config.mk | 7 +- llama.cpp/ggml-cuda.cu | 39 +++- llama.cpp/ggml-vector.inc | 208 ++++++++++++------ llama.cpp/ggml.c | 5 +- llama.cpp/llama.cpp | 4 + llama.cpp/llava/clip.cpp | 3 +- llama.cpp/llava/llava-cli.cpp | 3 +- llama.cpp/main/embedding.cpp | 1 - llama.cpp/main/main.cpp | 11 +- llamafile/BUILD.mk | 5 + llamafile/cuda.bat | 8 +- llamafile/cuda.sh | 11 +- llamafile/expf.h | 146 ------------ llamafile/get_app_dir.c | 10 +- .../{iqk_mul_mat.inc => iqk_mul_mat.cpp} | 15 +- llamafile/rocm.bat | 1 + llamafile/rocm.sh | 5 +- llamafile/sgemm.h | 2 + llamafile/tinyblas_cpu_mixmul.inc | 4 - llamafile/tinyblas_cpu_sgemm.inc | 20 +- llamafile/v_expf.c | 145 ------------ llamafile/version.h | 2 +- 24 files changed, 242 insertions(+), 467 deletions(-) delete mode 100644 .github/workflows/ci.yml delete mode 100644 llamafile/expf.h rename llamafile/{iqk_mul_mat.inc => iqk_mul_mat.cpp} (99%) delete mode 100644 llamafile/v_expf.c diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index ad99cb348f..0000000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,42 +0,0 @@ -name: CI -on: - push: - branches: [ master, main, fix ] - pull_request: - branches: [ master, main, fix ] -jobs: - Tests: - timeout-minutes: 60 - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ macos-latest ] # ubuntu-latest, windows-latest are currently non-functional, requiring adaptation for proper functionality. - steps: - - name: Checkout Repository - uses: actions/checkout@v4 - - name: Build - shell: bash - run: | - curl -L -o make https://cosmo.zip/pub/cosmos/bin/make - chmod +x make - ./make -j8 - - name: Make Llamafile - shell: bash - run: | - curl -L -o tinyllama.gguf https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF/resolve/main/tinyllama-1.1b-chat-v0.3.Q4_0.gguf - cat << EoF > .args - -m - tinyllama.gguf - ... - EoF - cp o//llama.cpp/main/main \ - tinyllama.llamafile - o//llamafile/zipalign -j0 \ - tinyllama.llamafile \ - tinyllama.gguf \ - .args - - name: Execute LLM CLI CPU # GA doesn't have "support_simdgroup_reduction" for RMS_NORM :'( - shell: bash - run: | - ./tinyllama.llamafile -e -p '## Famous Speech\n\nFour score and seven' -n 50 -ngl 0 diff --git a/README.md b/README.md index b38175d497..9e9a000cda 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ chmod +x llava-v1.5-7b-q4.llamafile 5. Run the llamafile. e.g.: ```sh -./llava-v1.5-7b-q4.llamafile -ngl 9999 +./llava-v1.5-7b-q4.llamafile ``` 6. Your browser should open automatically and display a chat interface. @@ -184,19 +184,19 @@ try out llamafile with different kinds of LLMs. Here is an example for the Mistral command-line llamafile: ```sh -./mistral-7b-instruct-v0.2.Q5_K_M.llamafile -ngl 9999 --temp 0.7 -p '[INST]Write a story about llamas[/INST]' +./mistral-7b-instruct-v0.2.Q5_K_M.llamafile --temp 0.7 -p '[INST]Write a story about llamas[/INST]' ``` And here is an example for WizardCoder-Python command-line llamafile: ```sh -./wizardcoder-python-13b.llamafile -ngl 9999 --temp 0 -e -r '```\n' -p '```c\nvoid *memcpy_sse2(char *dst, const char *src, size_t size) {\n' +./wizardcoder-python-13b.llamafile --temp 0 -e -r '```\n' -p '```c\nvoid *memcpy_sse2(char *dst, const char *src, size_t size) {\n' ``` And here's an example for the LLaVA command-line llamafile: ```sh -./llava-v1.5-7b-q4.llamafile -ngl 9999 --temp 0.2 --image lemurs.jpg -e -p '### User: What do you see?\n### Assistant:' +./llava-v1.5-7b-q4.llamafile --temp 0.2 --image lemurs.jpg -e -p '### User: What do you see?\n### Assistant:' ``` As before, macOS, Linux, and BSD users will need to use the "chmod" @@ -266,7 +266,7 @@ For Windows users, here's an example for the Mistral LLM: ```sh curl -L -o llamafile.exe https://github.com/Mozilla-Ocho/llamafile/releases/download/0.6/llamafile-0.6 curl -L -o mistral.gguf https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q4_K_M.gguf -./llamafile.exe -m mistral.gguf -ngl 9999 +./llamafile.exe -m mistral.gguf ``` Windows users may need to change `./llamafile.exe` to `.\llamafile.exe` @@ -423,7 +423,7 @@ llama.cpp command line interface, utilizing WizardCoder-Python-13B weights: ```sh -llamafile -ngl 9999 \ +llamafile \ -m wizardcoder-python-13b-v1.0.Q8_0.gguf \ --temp 0 -r '}\n' -r '```\n' \ -e -p '```c\nvoid *memcpy(void *dst, const void *src, size_t size) {\n' diff --git a/build/config.mk b/build/config.mk index e08baa3248..8770dfe28e 100644 --- a/build/config.mk +++ b/build/config.mk @@ -2,7 +2,7 @@ #── vi: set noet ft=make ts=8 sw=8 fenc=utf-8 :vi ────────────────────┘ PREFIX = /usr/local -COSMOCC = .cosmocc/3.3.5 +COSMOCC = .cosmocc/3.3.6 TOOLCHAIN = $(COSMOCC)/bin/cosmo AR = $(TOOLCHAIN)ar @@ -50,6 +50,7 @@ clean:; rm -rf o .PHONY: distclean distclean:; rm -rf o .cosmocc -.cosmocc/3.3.5: - build/download-cosmocc.sh $@ 3.3.5 db78fd8d3f8706e9dff4be72bf71d37a3f12062f212f407e1c33bc4af3780dd0 +.cosmocc/3.3.6: + build/download-cosmocc.sh $@ 3.3.6 26e3449357f31b82489774ef5c2d502a711bb711d4faf99a5fd6c96328a1c205 + diff --git a/llama.cpp/ggml-cuda.cu b/llama.cpp/ggml-cuda.cu index d55a587fc5..3020013c58 100644 --- a/llama.cpp/ggml-cuda.cu +++ b/llama.cpp/ggml-cuda.cu @@ -212,7 +212,6 @@ #include "ggml-backend-impl.h" static const struct ggml_backend_api *g_backend; -#define exit g_backend->exit #define getenv g_backend->getenv #define FLAG_log_disable (*g_backend->FLAG_log_disable) #define ggml_backend_register g_backend->ggml_backend_register @@ -242,6 +241,18 @@ static const struct ggml_backend_api *g_backend; #define ggml_is_empty g_backend->ggml_is_empty #define ggml_op_desc g_backend->ggml_op_desc +[[noreturn]] +static void exit_(int rc) { + g_backend->exit(rc); +#define exit exit_ +#if defined(__GNUC__) || defined(__llvm__) + __builtin_unreachable(); +#elif defined(_MSC_VER) + __assume(0); +#endif + for (;;); +} + // printf() and fprintf() runtime bridge // this is needed so text gets printed on windows // it also helps ensure the atomicity of log lines @@ -484,6 +495,14 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX } +#define FP16_AVAILABLE (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) ? \ + defined(RDNA1) || defined(RDNA2) || defined(RDNA3) : __CUDA_ARCH__ >= CC_PASCAL) +#define FP16_MMA_AVAILABLE (!(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA) +#if FP16_MMA_AVAILABLE +#include +#endif + +#if defined(GGML_MINIMIZE_CODE_SIZE) && FP16_AVAILABLE // [jart] static __device__ __forceinline__ half2 warp_reduce_max(half2 x) { #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL #pragma unroll @@ -496,6 +515,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) { NO_DEVICE_CODE; #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL } +#endif // [jart] #if CUDART_VERSION < CUDART_HMASK static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) { @@ -588,15 +608,6 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) { } #endif // defined(GGML_USE_HIPBLAS) -#define FP16_AVAILABLE defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) ? \ - defined(RDNA1) || defined(RDNA2) || defined(RDNA3) : __CUDA_ARCH__ >= CC_PASCAL - -#define FP16_MMA_AVAILABLE !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA - -#if FP16_MMA_AVAILABLE -#include -#endif - // TODO: move to ggml-common.h static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; @@ -823,7 +834,9 @@ void ggml_cuda_op_dequantize_mul_mat_vec( const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, const int64_t src1_padded_row_size, cudaStream_t stream); +#ifndef GGML_MINIMIZE_CODE_SIZE // [jart] void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +#endif #define CUDA_GET_ROWS_BLOCK_SIZE 256 @@ -5785,6 +5798,7 @@ template void launch_ launch_fattn_f16_impl(Q, K, V, KQV, mask, pool, main_stream); } +#ifndef GGML_MINIMIZE_CODE_SIZE // [jart] void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * Q = dst->src[0]; const ggml_tensor * K = dst->src[1]; @@ -5966,6 +5980,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst } return; } +#endif // GGML_MINIMIZE_CODE_SIZE [jart] template static __global__ void k_get_rows( @@ -12501,9 +12516,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_ARGSORT: ggml_cuda_op_argsort(ctx, dst); break; +#ifndef GGML_MINIMIZE_CODE_SIZE // [jart] case GGML_OP_FLASH_ATTN_EXT: ggml_cuda_flash_attn_ext(ctx, dst); break; +#endif default: return false; } @@ -12778,7 +12795,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons case GGML_OP_ARANGE: case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_LEAKY_RELU: +#ifndef GGML_MINIMIZE_CODE_SIZE // [jart] case GGML_OP_FLASH_ATTN_EXT: +#endif return true; default: return false; diff --git a/llama.cpp/ggml-vector.inc b/llama.cpp/ggml-vector.inc index ead2de864a..58e25dce64 100644 --- a/llama.cpp/ggml-vector.inc +++ b/llama.cpp/ggml-vector.inc @@ -3,7 +3,6 @@ #include "ggml-vector.h" #include "ggml-impl.h" -#include "llamafile/expf.h" #include "llamafile/llamafile.h" #include @@ -1185,36 +1184,157 @@ void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) { } } -void ggml_vec_silu_f32(const int n, float * y, const float * x) { - int i = 0; +#if defined(__ARM_NEON) -#if defined(__AVX512F__) - if (!FLAG_trap && !FLAG_precise) { - for (; i + 15 < n; i += 16) { - _mm512_storeu_ps(y + i, llamafile_silu_avx512(_mm512_loadu_ps(x + i))); - } - } +// adapted from arm limited optimized routine +// by justine tunney on the llamafile project +// the maximum error is 1.45358 plus 0.5 ulps +// numbers above 88.38 will flush to infinity +// numbers beneath -103.97 will flush to zero +inline static float32x4_t ggml_v_expf(float32x4_t x) { + const float32x4_t r = vdupq_n_f32(0x1.8p23f); + const float32x4_t z = vfmaq_f32(r, x, vdupq_n_f32(0x1.715476p+0f)); + const float32x4_t n = vsubq_f32(z, r); + const float32x4_t b = + vfmsq_f32(vfmsq_f32(x, n, vdupq_n_f32(0x1.62e4p-1f)), n, vdupq_n_f32(0x1.7f7d1cp-20f)); + const uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23); + const float32x4_t k = vreinterpretq_f32_u32(vaddq_u32(e, vdupq_n_u32(0x3f800000))); + const uint32x4_t c = vcagtq_f32(n, vdupq_n_f32(126)); + const float32x4_t u = vmulq_f32(b, b); + const float32x4_t j = vfmaq_f32( + vmulq_f32(vdupq_n_f32(0x1.ffffecp-1f), b), + vfmaq_f32(vfmaq_f32(vdupq_n_f32(0x1.fffdb6p-2f), vdupq_n_f32(0x1.555e66p-3f), b), + vfmaq_f32(vdupq_n_f32(0x1.573e2ep-5f), vdupq_n_f32(0x1.0e4020p-7f), b), u), + u); + if (!vpaddd_u64(vreinterpretq_u64_u32(c))) + return vfmaq_f32(k, j, k); + const uint32x4_t d = vandq_u32(vclezq_f32(n), vdupq_n_u32(0x82000000)); + const float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(d, vdupq_n_u32(0x7f000000))); + const float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, d)); + return vbslq_f32(vcagtq_f32(n, vdupq_n_f32(192)), vmulq_f32(s1, s1), + vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j))); +} + +// computes silu x/(1+exp(-x)) in single precision vector +inline static float32x4_t ggml_v_silu(float32x4_t x) { + const float32x4_t one = vdupq_n_f32(1.0f); + const float32x4_t zero = vdupq_n_f32(0.0f); + const float32x4_t neg_x = vsubq_f32(zero, x); + const float32x4_t exp_neg_x = ggml_v_expf(neg_x); + const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x); + return vdivq_f32(x, one_plus_exp_neg_x); +} + +#elif defined(__SSE2__) // __ARM_NEON + +#if defined(__FMA__) +#define MADD128(x, y, z) _mm_fmadd_ps(x, y, z) +#define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z) +#else +#define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z) +#define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y)) #endif -#if defined(__AVX2__) && defined(__FMA__) - if (!FLAG_trap && !FLAG_precise) { - for (; i + 7 < n; i += 8) { - _mm256_storeu_ps(y + i, llamafile_silu_avx2fma(_mm256_loadu_ps(x + i))); - } +// adapted from arm limited optimized routine +// by justine tunney on the llamafile project +// the maximum error is 1.45358 plus 0.5 ulps +// numbers above 88.38 will flush to infinity +// numbers beneath -103.97 will flush to zero +inline static __m128 ggml_v_expf(__m128 x) { + const __m128 r = _mm_set1_ps(0x1.8p23f); + const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r); + const __m128 n = _mm_sub_ps(z, r); + const __m128 b = + NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x)); + const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23); + const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1)))); + const __m128i c = + _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126))); + const __m128 u = _mm_mul_ps(b, b); + const __m128 j = + MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u, + MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))), + u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b)); + if (!_mm_movemask_epi8(c)) + return MADD128(j, k, k); + const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())), + _mm_set1_epi32(0x82000000u)); + const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u))); + const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g)); + const __m128i d = + _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192))); + return _mm_or_ps( + _mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)), + _mm_andnot_ps(_mm_castsi128_ps(d), + _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)), + _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k))))); +} + +// computes silu x/(1+exp(-x)) in single precision vector +inline static __m128 ggml_v_silu(__m128 x) { + const __m128 one = _mm_set1_ps(1); + const __m128 zero = _mm_setzero_ps(); + const __m128 neg_x = _mm_sub_ps(zero, x); + const __m128 exp_neg_x = ggml_v_expf(neg_x); + const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x); + return _mm_div_ps(x, one_plus_exp_neg_x); +} + +#endif // __ARM_NEON / __SSE2__ + +void ggml_vec_silu_f32(const int n, float * y, const float * x) { + int i = 0; + if (!FLAG_trap) { +#if defined(__SSE2__) + for (; i + 3 < n; i += 4) { + _mm_storeu_ps(y + i, ggml_v_silu(_mm_loadu_ps(x + i))); + } +#elif defined(__ARM_NEON) + for (; i + 3 < n; i += 4) { + vst1q_f32(y + i, ggml_v_silu(vld1q_f32(x + i))); } #endif + } + for (; i < n; ++i) { + y[i] = ggml_silu_f32(x[i]); + } +} -#if defined(__ARM_NEON) +float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) { + int i = 0; + ggml_float sum = 0; if (!FLAG_trap) { - for (; i + 3 < n; i += 4) { - vst1q_f32(y + i, llamafile_silu_neon(vld1q_f32(x + i))); - } +#if defined(__SSE2__) + for (; i + 3 < n; i += 4) { + __m128 val = ggml_v_expf(_mm_sub_ps(_mm_loadu_ps(x + i), + _mm_set1_ps(max))); + _mm_storeu_ps(y + i, val); +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) + val = _mm_add_ps(val, _mm_movehl_ps(val, val)); + val = _mm_add_ss(val, _mm_movehdup_ps(val)); +#else + __m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1)); + val = _mm_add_ps(val, tmp); + tmp = _mm_movehl_ps(tmp, val); + val = _mm_add_ss(val, tmp); +#endif + sum += (ggml_float)_mm_cvtss_f32(val); + } +#elif defined(__ARM_NEON) + for (; i + 3 < n; i += 4) { + float32x4_t val = ggml_v_expf(vsubq_f32(vld1q_f32(x + i), + vdupq_n_f32(max))); + vst1q_f32(y + i, val); + sum += vaddvq_f32(val); } #endif - + } for (; i < n; ++i) { - y[i] = ggml_silu_f32(x[i]); + float val = expf(x[i] - max); + sum += (ggml_float)val; + y[i] = val; } + return sum; } float ggml_silu_backward_f32(float x, float dy) { @@ -1282,51 +1402,3 @@ void ggml_vec_argmax_f32(const int n, int * s, const float * x) { } *s = idx; } - -float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) { - int i = 0; - ggml_float sum = 0; - -#if defined(__AVX512F__) - if (!FLAG_trap && !FLAG_precise) { - for (; i + 15 < n; i += 16) { - __m512 val = llamafile_expf_avx512(_mm512_sub_ps(_mm512_loadu_ps(x + i), - _mm512_set1_ps(max))); - _mm512_storeu_ps(y + i, val); - sum += _mm512_reduce_add_ps(val); - } - } -#endif - -#if defined(__AVX2__) && defined(__FMA__) - if (!FLAG_trap && !FLAG_precise) { - for (; i + 7 < n; i += 8) { - __m256 val = llamafile_expf_avx2fma(_mm256_sub_ps(_mm256_loadu_ps(x + i), - _mm256_set1_ps(max))); - _mm256_storeu_ps(y + i, val); - __m128 v2 = _mm_add_ps(_mm256_extractf128_ps(val, 1), - _mm256_castps256_ps128(val)); - v2 = _mm_add_ps(v2, _mm_movehl_ps(v2, v2)); - v2 = _mm_add_ss(v2, _mm_movehdup_ps(v2)); - sum += _mm_cvtss_f32(v2); - } - } -#endif - -#ifdef __ARM_NEON - if (!FLAG_trap) { - for (; i + 3 < n; i += 4) { - float32x4_t val = llamafile_expf_neon(vsubq_f32(vld1q_f32(x + i), vdupq_n_f32(max))); - vst1q_f32(y + i, val); - sum += vaddvq_f32(val); - } - } -#endif - - for (; i < n; ++i) { - float val = expf(x[i] - max); - sum += (ggml_float)val; - y[i] = val; - } - return sum; -} diff --git a/llama.cpp/ggml.c b/llama.cpp/ggml.c index 83b3b14bef..a90a4ac9da 100644 --- a/llama.cpp/ggml.c +++ b/llama.cpp/ggml.c @@ -17756,6 +17756,7 @@ struct ggml_compute_state { int ith; struct ggml_compute_state_shared * shared; enum ggml_status ec; + bool is_main_thread; // [jart] }; static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) { @@ -18064,7 +18065,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { int task_phase = GGML_TASK_TYPE_FINALIZE; #ifdef LLAMAFILE_DEBUG - if (FLAG_trap) { + if (FLAG_trap && !state->is_main_thread) { llamafile_trapping_enabled(+1); } #endif @@ -18522,6 +18523,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl .ith = j, .shared = &state_shared, .ec = GGML_STATUS_SUCCESS, + .is_main_thread = false, // [jart] }; const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); @@ -18533,6 +18535,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl workers[0].ith = 0; workers[0].shared = &state_shared; workers[0].ec = GGML_STATUS_SUCCESS; + workers[0].is_main_thread = true; // [jart] const int64_t perf_start_cycles = ggml_perf_cycles(); const int64_t perf_start_time_us = ggml_perf_time_us(); diff --git a/llama.cpp/llama.cpp b/llama.cpp/llama.cpp index 7d7288aaf9..3b341e5aef 100644 --- a/llama.cpp/llama.cpp +++ b/llama.cpp/llama.cpp @@ -2,6 +2,7 @@ // vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi #define LLAMA_API_INTERNAL #include "llamafile/log.h" +#include "llamafile/debug.h" #include "llama.h" #include "unicode.h" @@ -17661,6 +17662,7 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) { void llama_print_timings(struct llama_context * ctx) { const llama_timings timings = llama_get_timings(ctx); + llamafile_trapping_enabled(-1); // [jart] LLAMA_LOG_INFO("\n"); LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, timings.t_load_ms); @@ -17671,6 +17673,8 @@ void llama_print_timings(struct llama_context * ctx) { LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval); LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval)); + + llamafile_trapping_enabled(+1); // [jart] } void llama_reset_timings(struct llama_context * ctx) { diff --git a/llama.cpp/llava/clip.cpp b/llama.cpp/llava/clip.cpp index dc7e0fb5fc..fdcc8b93cf 100644 --- a/llama.cpp/llava/clip.cpp +++ b/llama.cpp/llava/clip.cpp @@ -139,7 +139,8 @@ static std::map PROJECTOR_TYPE_NAMES = { static int get_key_idx(const gguf_context * ctx, const char * key) { int i = gguf_find_key(ctx, key); if (i == -1) { - LOG_TEE("key %s not found in file\n", key); + // [jart] don't log to console errors that aren't errors + LOG("%s: note: key %s not found in file\n", __func__, key); throw std::runtime_error(format("Missing required key: %s", key)); } diff --git a/llama.cpp/llava/llava-cli.cpp b/llama.cpp/llava/llava-cli.cpp index 57621ae6a9..1df2af9a7d 100644 --- a/llama.cpp/llava/llava-cli.cpp +++ b/llama.cpp/llava/llava-cli.cpp @@ -296,14 +296,13 @@ int llava_cli(int argc, char ** argv, gpt_params & params) { #ifndef LOG_DISABLE_LOGS log_set_target(log_filename_generator("llava", "log")); - LOG_TEE("Log start\n"); log_dump_cmdline(argc, argv); llama_log_set(llama_log_callback_logTee, nullptr); #endif // LOG_DISABLE_LOGS if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { gpt_print_usage(argc, argv, params); - show_additional_info(argc, argv); + // show_additional_info(argc, argv); // [jart] no help unless we ask for it return 1; } auto model = llava_init(¶ms); diff --git a/llama.cpp/main/embedding.cpp b/llama.cpp/main/embedding.cpp index 4ce980e324..583f88152d 100644 --- a/llama.cpp/main/embedding.cpp +++ b/llama.cpp/main/embedding.cpp @@ -78,7 +78,6 @@ int embedding_cli(int argc, char ** argv) { #ifndef LOG_DISABLE_LOGS log_set_target(stderr); - LOG_TEE("Log start\n"); log_dump_cmdline(argc, argv); llama_log_set(llama_log_callback_logTee, nullptr); #endif // LOG_DISABLE_LOGS diff --git a/llama.cpp/main/main.cpp b/llama.cpp/main/main.cpp index 9e04e2b04b..4c0eb97d24 100644 --- a/llama.cpp/main/main.cpp +++ b/llama.cpp/main/main.cpp @@ -179,7 +179,6 @@ int main(int argc, char ** argv) { #ifndef LOG_DISABLE_LOGS log_set_target(log_filename_generator("main", "log")); - LOG_TEE("Log start\n"); log_dump_cmdline(argc, argv); llama_log_set(llama_log_callback_logTee, nullptr); #endif // LOG_DISABLE_LOGS @@ -206,7 +205,7 @@ int main(int argc, char ** argv) { } __pledge_mode = PLEDGE_PENALTY_RETURN_EPERM; if (pledge(0, 0)) { - LOG_TEE("warning: this OS doesn't support pledge() security\n"); + LOG("warning: this OS doesn't support pledge() security\n"); } else if (pledge(promises, 0)) { perror("pledge"); exit(1); @@ -594,6 +593,7 @@ int main(int argc, char ** argv) { antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true)); } + bool should_show_special_tokens = sparams.grammar.empty(); // [jart] for shell scriptability struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); if (!ctx_sampling) { // [jart] fixes crash fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__); @@ -812,7 +812,12 @@ int main(int argc, char ** argv) { // display text if (input_echo && display) { for (auto id : embd) { - const std::string token_str = llama_token_to_piece(ctx, id); + + // [jart] shell scriptability + const std::string token_str = + llama_token_to_piece( + ctx, id, should_show_special_tokens); + printf("%s", token_str.c_str()); if (embd.size() > 1) { diff --git a/llamafile/BUILD.mk b/llamafile/BUILD.mk index a6114ef45c..58b272b16c 100644 --- a/llamafile/BUILD.mk +++ b/llamafile/BUILD.mk @@ -89,6 +89,7 @@ o/$(MODE)/llamafile: \ # o/$(MODE)/llamafile/sgemm.o: private CXXFLAGS += -Os +o/$(MODE)/llamafile/iqk_mul_mat.o: private TARGET_ARCH += -Xx86_64-mtune=skylake -Xx86_64-mavx2 -Xx86_64-mfma o/$(MODE)/llamafile/tinyblas_cpu_sgemm_amd_avx.o: private TARGET_ARCH += -Xx86_64-mtune=sandybridge -Xx86_64-mf16c o/$(MODE)/llamafile/tinyblas_cpu_mixmul_amd_avx.o: private TARGET_ARCH += -Xx86_64-mtune=sandybridge -Xx86_64-mf16c o/$(MODE)/llamafile/tinyblas_cpu_sgemm_amd_fma.o: private TARGET_ARCH += -Xx86_64-mtune=bdver2 -Xx86_64-mf16c -Xx86_64-mfma @@ -134,6 +135,10 @@ o/$(MODE)/llamafile/tinyblas_test: \ o/$(MODE)/llamafile/tester.o build/cudacc -g -o $@ $^ -lcublas +o/$(MODE)/llamafile/compcap: \ + o/$(MODE)/llamafile/compcap.o + build/cudacc -g -o $@ $^ -lcublas + o/$(MODE)/llamafile/cudaprops: \ o/$(MODE)/llamafile/cudaprops.o \ o/$(MODE)/llamafile/tester.o diff --git a/llamafile/cuda.bat b/llamafile/cuda.bat index 4f1997eb9f..4dd5b6d6bc 100644 --- a/llamafile/cuda.bat +++ b/llamafile/cuda.bat @@ -5,12 +5,7 @@ :: driver on a Windows system that has a CUDA-capable GPU installed. nvcc --shared ^ - -gencode=arch=compute_50,code=sm_50 ^ - -gencode=arch=compute_60,code=sm_60 ^ - -gencode=arch=compute_70,code=sm_70 ^ - -gencode=arch=compute_75,code=sm_75 ^ - -gencode=arch=compute_80,code=sm_80 ^ - -gencode=arch=compute_90,code=sm_90 ^ + -arch=all-major ^ --forward-unknown-to-host-compiler ^ -Xcompiler="/nologo /EHsc /O2 /GR /MT" ^ -DNDEBUG ^ @@ -21,6 +16,7 @@ nvcc --shared ^ -DGGML_CUDA_DMMV_X=32 ^ -DK_QUANTS_PER_ITERATION=2 ^ -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 ^ + -DGGML_MINIMIZE_CODE_SIZE ^ -DGGML_USE_TINYBLAS ^ -o ggml-cuda.dll ^ ggml-cuda.cu ^ diff --git a/llamafile/cuda.sh b/llamafile/cuda.sh index 99af991b0d..fc4d0c3bfb 100755 --- a/llamafile/cuda.sh +++ b/llamafile/cuda.sh @@ -23,15 +23,9 @@ cd "$TMP" /usr/local/cuda/bin/nvcc \ --shared \ - -gencode=arch=compute_50,code=sm_50 \ - -gencode=arch=compute_60,code=sm_60 \ - -gencode=arch=compute_70,code=sm_70 \ - -gencode=arch=compute_75,code=sm_75 \ - -gencode=arch=compute_80,code=sm_80 \ - -gencode=arch=compute_90,code=sm_90 \ + -arch=all-major \ --forward-unknown-to-host-compiler \ - --compiler-options \ - "-fPIC -O3 -march=native -mtune=native" \ + --compiler-options "-fPIC -O2" \ -DNDEBUG \ -DGGML_BUILD=1 \ -DGGML_SHARED=1 \ @@ -40,6 +34,7 @@ cd "$TMP" -DGGML_CUDA_DMMV_X=32 \ -DK_QUANTS_PER_ITERATION=2 \ -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 \ + -DGGML_MINIMIZE_CODE_SIZE \ -DGGML_USE_TINYBLAS \ -o ~/ggml-cuda.so \ ggml-cuda.cu \ diff --git a/llamafile/expf.h b/llamafile/expf.h deleted file mode 100644 index 200829546f..0000000000 --- a/llamafile/expf.h +++ /dev/null @@ -1,146 +0,0 @@ -// -*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*- -// vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi -// -// Copyright 2023 Mozilla Foundation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#ifdef __AVX512F__ -#include - -// computes expf() for each element in vector. -// -// the maximum error is 1.45358 +0.5 ulp. the only difference between -// this function and expf(), is that there's currently no support for -// subnormals. input values are clamped to range: [-87.6831, 88.3763] -// whereas expf() allows inputs as low as -103.972. therefore numbers -// will be flushed to zero sooner than they otherwise would with this -// function. nearest rounding mode is always used. exception trapping -// isn't supported although this function does a good job avoiding it -// -static inline __m512 llamafile_expf_avx512(__m512 x) { - __m512 a, b, c, d, e, f, g; - __m512 will_turn_into_inf = _mm512_set1_ps(0x1.62e44p+6f); - __m512 max_before_overflow = _mm512_set1_ps(0x1.61814cp+6f); - __m512 min_before_underflow = _mm512_set1_ps(-0x1.5ebb86p+6f); - x = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(x, max_before_overflow, _CMP_GE_OQ), x, - will_turn_into_inf); - x = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(x, min_before_underflow, _CMP_LE_OQ), x, - min_before_underflow); - a = _mm512_fmadd_round_ps(_mm512_set1_ps(0x1.715476p+0f), x, _mm512_set1_ps(0x1.8p23f), - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - b = _mm512_sub_ps(a, _mm512_set1_ps(0x1.8p23f)); - c = _mm512_fnmadd_round_ps(_mm512_set1_ps(0x1.62e4p-1f), b, x, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - c = _mm512_fnmadd_round_ps(_mm512_set1_ps(0x1.7f7d1cp-20f), b, c, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - d = _mm512_castsi512_ps(_mm512_add_epi32(_mm512_slli_epi32(_mm512_castps_si512(a), 23), - _mm512_set1_epi32(0x3f800000u))); - e = _mm512_mul_round_ps(c, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - f = _mm512_fmadd_round_ps(_mm512_set1_ps(0x1.0e4020p-7f), c, _mm512_set1_ps(0x1.573e2ep-5f), - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - g = _mm512_fmadd_round_ps(_mm512_set1_ps(0x1.555e66p-3f), c, _mm512_set1_ps(0x1.fffdb6p-2f), - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - g = _mm512_fmadd_round_ps(f, e, g, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - f = _mm512_mul_round_ps(_mm512_set1_ps(0x1.ffffecp-1f), c, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - return _mm512_fmadd_round_ps( - _mm512_fmadd_round_ps(g, e, f, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC), d, d, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); -} - -// computes silu x/(1+exp(-x)) in single precision -static inline __m512 llamafile_silu_avx512(__m512 x) { - __m512 one = _mm512_set1_ps(1); - __m512 zero = _mm512_setzero_ps(); - __m512 neg_x = _mm512_sub_ps(zero, x); - __m512 exp_neg_x = llamafile_expf_avx512(neg_x); - __m512 one_plus_exp_neg_x = _mm512_add_ps(one, exp_neg_x); - return _mm512_div_ps(x, one_plus_exp_neg_x); -} - -#endif // __AVX512F__ - -#if defined(__AVX2__) && defined(__FMA__) -#include - -// computes expf() for each element in vector. -// -// the maximum error is 1.45358 +0.5 ulp. the only difference between -// this function and expf(), is that there's currently no support for -// subnormals. input values are clamped to range: [-87.6831, 88.3763] -// whereas expf() allows inputs as low as -103.972. therefore numbers -// will be flushed to zero sooner than they otherwise would with this -// function. exception trapping isnt supported although this function -// does a good job avoiding it. -// -static inline __m256 llamafile_expf_avx2fma(__m256 x) { - __m256 a, b, c, d, e, f, g; - __m256 will_turn_into_inf = _mm256_set1_ps(0x1.62e44p+6f); - __m256 max_before_overflow = _mm256_set1_ps(0x1.61814cp+6f); - __m256 min_before_underflow = _mm256_set1_ps(-0x1.5ebb86p+6f); - __m256 min_mask = _mm256_cmp_ps(x, min_before_underflow, _CMP_LE_OQ); - __m256 max_mask = _mm256_cmp_ps(x, max_before_overflow, _CMP_GE_OQ); - x = _mm256_or_ps(_mm256_and_ps(min_mask, min_before_underflow), _mm256_andnot_ps(min_mask, x)); - x = _mm256_or_ps(_mm256_and_ps(max_mask, will_turn_into_inf), _mm256_andnot_ps(max_mask, x)); - a = _mm256_fmadd_ps(_mm256_set1_ps(0x1.715476p+0f), x, _mm256_set1_ps(0x1.8p23f)); - b = _mm256_sub_ps(a, _mm256_set1_ps(0x1.8p23f)); - c = _mm256_fnmadd_ps(_mm256_set1_ps(0x1.62e4p-1f), b, x); - c = _mm256_fnmadd_ps(_mm256_set1_ps(0x1.7f7d1cp-20f), b, c); - d = _mm256_castsi256_ps(_mm256_add_epi32(_mm256_slli_epi32(_mm256_castps_si256(a), 23), - _mm256_set1_epi32(0x3f800000u))); - e = _mm256_mul_ps(c, c); - f = _mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), c, _mm256_set1_ps(0x1.573e2ep-5f)); - g = _mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), c, _mm256_set1_ps(0x1.fffdb6p-2f)); - g = _mm256_fmadd_ps(f, e, g); - f = _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), c); - return _mm256_fmadd_ps(_mm256_fmadd_ps(g, e, f), d, d); -} - -// computes silu x/(1+exp(-x)) in single precision -static inline __m256 llamafile_silu_avx2fma(__m256 x) { - __m256 one = _mm256_set1_ps(1); - __m256 zero = _mm256_setzero_ps(); - __m256 neg_x = _mm256_sub_ps(zero, x); - __m256 exp_neg_x = llamafile_expf_avx2fma(neg_x); - __m256 one_plus_exp_neg_x = _mm256_add_ps(one, exp_neg_x); - return _mm256_div_ps(x, one_plus_exp_neg_x); -} - -#endif // __AVX2__ - -#ifdef __ARM_NEON -#include - -float32x4_t v_expf(float32x4_t); - -static inline float32x4_t llamafile_expf_neon(float32x4_t x) { - return v_expf(x); -} - -static inline float32x4_t llamafile_silu_neon(float32x4_t x) { - float32x4_t one = vdupq_n_f32(1.0f); - float32x4_t zero = vdupq_n_f32(0.0f); - float32x4_t neg_x = vsubq_f32(zero, x); - float32x4_t exp_neg_x = llamafile_expf_neon(neg_x); - float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x); - return vdivq_f32(x, one_plus_exp_neg_x); -} - -#endif - -#ifdef __cplusplus -} -#endif diff --git a/llamafile/get_app_dir.c b/llamafile/get_app_dir.c index 957473d914..cb95afd610 100644 --- a/llamafile/get_app_dir.c +++ b/llamafile/get_app_dir.c @@ -16,12 +16,20 @@ // limitations under the License. #include "llamafile.h" +#include #include +static const char *llamafile_get_home_dir(void) { + const char *homedir; + if (!(homedir = getenv("HOME")) || !*homedir) + homedir = "."; + return homedir; +} + /** * Returns path of directory for app-specific files. */ void llamafile_get_app_dir(char *path, size_t size) { - strlcpy(path, llamafile_get_tmp_dir(), size); + strlcpy(path, llamafile_get_home_dir(), size); strlcat(path, "/.llamafile/", size); } diff --git a/llamafile/iqk_mul_mat.inc b/llamafile/iqk_mul_mat.cpp similarity index 99% rename from llamafile/iqk_mul_mat.inc rename to llamafile/iqk_mul_mat.cpp index 2ffa2b5e41..112544d7a3 100644 --- a/llamafile/iqk_mul_mat.inc +++ b/llamafile/iqk_mul_mat.cpp @@ -14,7 +14,15 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -// + +#ifdef __x86_64__ + +#include "llama.cpp/ggml-impl.h" +#include "llama.cpp/ggml-quants.h" +#include "sgemm.h" + +// clang-format off + // This matrix - vector and matrix - matrix multiplication implementation // for k-quants and IQ4_XS makes prompt processing 150-200% faster // compared to mainline llama.cpp (and llamafile). @@ -26,7 +34,6 @@ // multiplication (as needed for prompt processing), we can get // a significant speedup by reusing the unpacked QX quants and scales // for multiplication with several Q8_K columns. -// namespace { @@ -710,6 +717,8 @@ static void mul_mat_iq4_xs_q8_K_T(int n, float * s, size_t bs, const void * vx, } +} // namespace + // // ============================== Matrix multiplications // @@ -777,4 +786,4 @@ bool iqk_mul_mat(long Nx, long Ny, long ne00, int typeA, const void * A, const v return true; } -} +#endif // __x86_64__ diff --git a/llamafile/rocm.bat b/llamafile/rocm.bat index 58874d3c69..c76fd9aa70 100644 --- a/llamafile/rocm.bat +++ b/llamafile/rocm.bat @@ -31,6 +31,7 @@ -DGGML_CUDA_MMV_Y=1 ^ -DGGML_USE_HIPBLAS ^ -DGGML_USE_TINYBLAS ^ + -DGGML_MINIMIZE_CODE_SIZE ^ -DK_QUANTS_PER_ITERATION=2 ^ -D_CRT_SECURE_NO_WARNINGS ^ -D_XOPEN_SOURCE=600 ^ diff --git a/llamafile/rocm.sh b/llamafile/rocm.sh index b7091ec2bd..444922555c 100755 --- a/llamafile/rocm.sh +++ b/llamafile/rocm.sh @@ -14,12 +14,13 @@ hipcc \ -DGGML_SHARED=1 \ -Wno-return-type \ -Wno-unused-result \ - -DGGML_USE_HIPBLAS \ - -DGGML_USE_TINYBLAS \ -DGGML_CUDA_MMV_Y=1 \ -DGGML_MULTIPLATFORM \ + -DGGML_USE_HIPBLAS=1 \ + -DGGML_USE_TINYBLAS=1 \ -DGGML_CUDA_DMMV_X=32 \ -DK_QUANTS_PER_ITERATION=2 \ + -DGGML_MINIMIZE_CODE_SIZE=1 \ -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 \ --amdgpu-target=gfx1100,gfx1031,gfx1030,gfx1032,gfx906,gfx1101,gfx1102,gfx1103 \ -o ggml-rocm.so \ diff --git a/llamafile/sgemm.h b/llamafile/sgemm.h index 5ac2d97457..e9a99577c8 100644 --- a/llamafile/sgemm.h +++ b/llamafile/sgemm.h @@ -7,6 +7,8 @@ extern "C" { struct ggml_tensor; struct ggml_compute_params; +bool iqk_mul_mat(long, long, long, int, const void *, const void *, float *, long, int, int); + bool llamafile_sgemm(long, long, long, const void *, long, const void *, long, void *, long, int, int, int, int, int, int, int); bool llamafile_mixmul(const struct ggml_compute_params *, const struct ggml_tensor *, diff --git a/llamafile/tinyblas_cpu_mixmul.inc b/llamafile/tinyblas_cpu_mixmul.inc index a841114d92..9e42d87400 100644 --- a/llamafile/tinyblas_cpu_mixmul.inc +++ b/llamafile/tinyblas_cpu_mixmul.inc @@ -147,10 +147,6 @@ class MixMul { switch (result->type) { case GGML_TYPE_F32: return mixmuler(); - case GGML_TYPE_BF16: - return mixmuler(); - case GGML_TYPE_F16: - return mixmuler(); default: return false; } diff --git a/llamafile/tinyblas_cpu_sgemm.inc b/llamafile/tinyblas_cpu_sgemm.inc index f90307f726..5d7d97a1e0 100644 --- a/llamafile/tinyblas_cpu_sgemm.inc +++ b/llamafile/tinyblas_cpu_sgemm.inc @@ -269,10 +269,6 @@ bool llamafile_sgemm_impl(long m, long n, long k, const void *A, long lda, const } // namespace -#if defined __AVX2__ && QK_K == 256 -#include "iqk_mul_mat.inc" -#endif - /** * Performs optimized matrix multiplication on CPU. * @@ -317,10 +313,12 @@ bool llamafile_sgemm(long m, long n, long k, const void *A, long lda, const void assert(nth > 0); assert(ith < nth); -#if defined __AVX2__ && QK_K == 256 - if (Btype == GGML_TYPE_Q8_K && Ctype == GGML_TYPE_F32) { - if (iqk_mul_mat(m, n, k*QK_K, Atype, A, B, (float *)C, ldc, ith, nth)) { - return true; +#if defined(__x86_64__) && QK_K == 256 + if (X86_HAVE(AVX2) && X86_HAVE(FMA)) { + if (Btype == GGML_TYPE_Q8_K && Ctype == GGML_TYPE_F32) { + if (iqk_mul_mat(m, n, k * QK_K, Atype, A, B, (float *)C, ldc, ith, nth)) { + return true; + } } } #endif @@ -329,12 +327,6 @@ bool llamafile_sgemm(long m, long n, long k, const void *A, long lda, const void case GGML_TYPE_F32: return llamafile_sgemm_impl(m, n, k, A, lda, B, ldb, (float *)C, ldc, ith, nth, task, Atype, Btype, Ctype, precision); - case GGML_TYPE_BF16: - return llamafile_sgemm_impl(m, n, k, A, lda, B, ldb, (ggml_bf16_t *)C, ldc, ith, nth, task, - Atype, Btype, Ctype, precision); - case GGML_TYPE_F16: - return llamafile_sgemm_impl(m, n, k, A, lda, B, ldb, (ggml_fp16_t *)C, ldc, ith, nth, task, - Atype, Btype, Ctype, precision); default: return NOT_SUPPORTED; } diff --git a/llamafile/v_expf.c b/llamafile/v_expf.c deleted file mode 100644 index 4127df9203..0000000000 --- a/llamafile/v_expf.c +++ /dev/null @@ -1,145 +0,0 @@ -// -*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*- -// vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi -// -// Copyright 1999-2022 Arm Limited -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifdef __aarch64__ - -#include - -#define V4(X) {X, X, X, X} -#define NOINLINE __attribute__((__noinline__)) -#define VPCS_ATTR __attribute__((aarch64_vector_pcs)) -#define likely(x) __builtin_expect(!!(x), 1) -#define unlikely(x) __builtin_expect(x, 0) - -static inline uint32x4_t v_u32(uint32_t x) { - return (uint32x4_t)V4(x); -} - -/* true if any elements of a vector compare result is non-zero. */ -static inline int v_any_u32(uint32x4_t x) { - /* assume elements in x are either 0 or -1u. */ - return vpaddd_u64(vreinterpretq_u64_u32(x)) != 0; -} - -static const struct data { - float32x4_t poly[5]; - float32x4_t shift, inv_ln2, ln2_hi, ln2_lo; - uint32x4_t exponent_bias; -#if !WANT_SIMD_EXCEPT - float32x4_t special_bound, scale_thresh; -#endif -} data = { - /* maxerr: 1.45358 +0.5 ulp. */ - .poly = {V4(0x1.0e4020p-7f), V4(0x1.573e2ep-5f), V4(0x1.555e66p-3f), V4(0x1.fffdb6p-2f), - V4(0x1.ffffecp-1f)}, - .shift = V4(0x1.8p23f), - .inv_ln2 = V4(0x1.715476p+0f), - .ln2_hi = V4(0x1.62e4p-1f), - .ln2_lo = V4(0x1.7f7d1cp-20f), - .exponent_bias = V4(0x3f800000), -#if !WANT_SIMD_EXCEPT - .special_bound = V4(126.0f), - .scale_thresh = V4(192.0f), -#endif -}; - -#define C(i) d->poly[i] - -#if WANT_SIMD_EXCEPT - -#define TinyBound v_u32(0x20000000) /* asuint (0x1p-63). */ -#define BigBound v_u32(0x42800000) /* asuint (0x1p6). */ -#define SpecialBound v_u32(0x22800000) /* BigBound - TinyBound. */ - -static float32x4_t VPCS_ATTR NOINLINE special_case(float32x4_t x, float32x4_t y, uint32x4_t cmp) { - /* If fenv exceptions are to be triggered correctly, fall back to the scalar - routine to special lanes. */ - return v_call_f32(expf, x, y, cmp); -} - -#else - -#define SpecialOffset v_u32(0x82000000) -#define SpecialBias v_u32(0x7f000000) - -static float32x4_t VPCS_ATTR NOINLINE special_case(float32x4_t poly, float32x4_t n, uint32x4_t e, - uint32x4_t cmp1, float32x4_t scale, - const struct data *d) { - /* 2^n may overflow, break it up into s1*s2. */ - uint32x4_t b = vandq_u32(vclezq_f32(n), SpecialOffset); - float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(b, SpecialBias)); - float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, b)); - uint32x4_t cmp2 = vcagtq_f32(n, d->scale_thresh); - float32x4_t r2 = vmulq_f32(s1, s1); - float32x4_t r1 = vmulq_f32(vfmaq_f32(s2, poly, s2), s1); - /* Similar to r1 but avoids double rounding in the subnormal range. */ - float32x4_t r0 = vfmaq_f32(scale, poly, scale); - float32x4_t r = vbslq_f32(cmp1, r1, r0); - return vbslq_f32(cmp2, r2, r); -} - -#endif - -float32x4_t v_expf(float32x4_t x) { - const struct data *d = &data; - __asm__("" : "+r"(d)); - float32x4_t n, r, r2, scale, p, q, poly, z; - uint32x4_t cmp, e; - -#if WANT_SIMD_EXCEPT - /* asuint(x) - TinyBound >= BigBound - TinyBound. */ - cmp = vcgeq_u32(vsubq_u32(vandq_u32(vreinterpretq_u32_f32(x), v_u32(0x7fffffff)), TinyBound), - SpecialBound); - float32x4_t xm = x; - /* If any lanes are special, mask them with 1 and retain a copy of x to allow - special case handler to fix special lanes later. This is only necessary if - fenv exceptions are to be triggered correctly. */ - if (unlikely(v_any_u32(cmp))) - x = vbslq_f32(cmp, v_f32(1), x); -#endif - - /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] - x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ - z = vfmaq_f32(d->shift, x, d->inv_ln2); - n = vsubq_f32(z, d->shift); - r = vfmsq_f32(x, n, d->ln2_hi); - r = vfmsq_f32(r, n, d->ln2_lo); - e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23); - scale = vreinterpretq_f32_u32(vaddq_u32(e, d->exponent_bias)); - -#if !WANT_SIMD_EXCEPT - cmp = vcagtq_f32(n, d->special_bound); -#endif - - r2 = vmulq_f32(r, r); - p = vfmaq_f32(C(1), C(0), r); - q = vfmaq_f32(C(3), C(2), r); - q = vfmaq_f32(q, p, r2); - p = vmulq_f32(C(4), r); - poly = vfmaq_f32(p, q, r2); - - if (unlikely(v_any_u32(cmp))) -#if WANT_SIMD_EXCEPT - return special_case(xm, vfmaq_f32(scale, poly, scale), cmp); -#else - return special_case(poly, n, e, cmp, scale, d); -#endif - - return vfmaq_f32(scale, poly, scale); -} - -#endif // __aarch64__ diff --git a/llamafile/version.h b/llamafile/version.h index 881249e70f..7ed1afe3d0 100644 --- a/llamafile/version.h +++ b/llamafile/version.h @@ -2,7 +2,7 @@ #define LLAMAFILE_MAJOR 0 #define LLAMAFILE_MINOR 8 -#define LLAMAFILE_PATCH 1 +#define LLAMAFILE_PATCH 2 #define LLAMAFILE_VERSION \ (100000000 * LLAMAFILE_MAJOR + 1000000 * LLAMAFILE_MINOR + LLAMAFILE_PATCH)