From 4ee1e398273d63d5a6a9554d89eeabb784568f36 Mon Sep 17 00:00:00 2001
From: Justine Tunney <jtunney@mozilla.com>
Date: Tue, 7 May 2024 22:32:28 -0700
Subject: [PATCH] Release llamafile v0.8.2

- Upgrade to cosmocc 3.3.6
- Remove warnings from cuda build
- Fix bug in llamafile_trapping_enabled
- Refactor the new vectorized expf() code
- iqk_mul_mat() only needs codegen for AVX2
- Be less gung ho about the -ngl flag in README
- Restore shell scriptabiilty fix for new tokenizer
- Suppress divide by zero errors llama_print_timings()
- Cut back on tinyBLAS CPU multiple output type kernels
- Cut back NVIDIA fat binary releases to -arch=all-major
- Remove GA (won't rely on slow broken irregular cloud dev tools)
- Cut flash_attn_ext from release binaries (use --recompile to have it)
---
 .github/workflows/ci.yml                      |  42 ----
 README.md                                     |  12 +-
 build/config.mk                               |   7 +-
 llama.cpp/ggml-cuda.cu                        |  39 +++-
 llama.cpp/ggml-vector.inc                     | 208 ++++++++++++------
 llama.cpp/ggml.c                              |   5 +-
 llama.cpp/llama.cpp                           |   4 +
 llama.cpp/llava/clip.cpp                      |   3 +-
 llama.cpp/llava/llava-cli.cpp                 |   3 +-
 llama.cpp/main/embedding.cpp                  |   1 -
 llama.cpp/main/main.cpp                       |  11 +-
 llamafile/BUILD.mk                            |   5 +
 llamafile/cuda.bat                            |   8 +-
 llamafile/cuda.sh                             |  11 +-
 llamafile/expf.h                              | 146 ------------
 llamafile/get_app_dir.c                       |  10 +-
 .../{iqk_mul_mat.inc => iqk_mul_mat.cpp}      |  15 +-
 llamafile/rocm.bat                            |   1 +
 llamafile/rocm.sh                             |   5 +-
 llamafile/sgemm.h                             |   2 +
 llamafile/tinyblas_cpu_mixmul.inc             |   4 -
 llamafile/tinyblas_cpu_sgemm.inc              |  20 +-
 llamafile/v_expf.c                            | 145 ------------
 llamafile/version.h                           |   2 +-
 24 files changed, 242 insertions(+), 467 deletions(-)
 delete mode 100644 .github/workflows/ci.yml
 delete mode 100644 llamafile/expf.h
 rename llamafile/{iqk_mul_mat.inc => iqk_mul_mat.cpp} (99%)
 delete mode 100644 llamafile/v_expf.c

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
deleted file mode 100644
index ad99cb348f..0000000000
--- a/.github/workflows/ci.yml
+++ /dev/null
@@ -1,42 +0,0 @@
-name: CI
-on:
-  push:
-    branches: [ master, main, fix ]
-  pull_request:
-    branches: [ master, main, fix ]
-jobs:
-  Tests:
-    timeout-minutes: 60
-    runs-on: ${{ matrix.os }}
-    strategy:
-      fail-fast: false
-      matrix:
-        os: [ macos-latest ] # ubuntu-latest, windows-latest are currently non-functional, requiring adaptation for proper functionality. 
-    steps:
-      - name: Checkout Repository
-        uses: actions/checkout@v4
-      - name: Build
-        shell: bash
-        run: |
-          curl -L -o make https://cosmo.zip/pub/cosmos/bin/make
-          chmod +x make
-          ./make -j8
-      - name: Make Llamafile
-        shell: bash 
-        run: |
-          curl -L -o tinyllama.gguf https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF/resolve/main/tinyllama-1.1b-chat-v0.3.Q4_0.gguf
-          cat << EoF > .args
-          -m
-          tinyllama.gguf
-          ...
-          EoF
-          cp o//llama.cpp/main/main \
-            tinyllama.llamafile
-          o//llamafile/zipalign -j0 \
-            tinyllama.llamafile \
-            tinyllama.gguf \
-            .args
-      - name: Execute LLM CLI CPU  # GA doesn't have "support_simdgroup_reduction" for RMS_NORM :'(
-        shell: bash
-        run: |
-          ./tinyllama.llamafile -e -p '## Famous Speech\n\nFour score and seven' -n 50 -ngl 0
diff --git a/README.md b/README.md
index b38175d497..9e9a000cda 100644
--- a/README.md
+++ b/README.md
@@ -42,7 +42,7 @@ chmod +x llava-v1.5-7b-q4.llamafile
 5. Run the llamafile. e.g.:
 
 ```sh
-./llava-v1.5-7b-q4.llamafile -ngl 9999
+./llava-v1.5-7b-q4.llamafile
 ```
 
 6. Your browser should open automatically and display a chat interface. 
@@ -184,19 +184,19 @@ try out llamafile with different kinds of LLMs.
 Here is an example for the Mistral command-line llamafile:
 
 ```sh
-./mistral-7b-instruct-v0.2.Q5_K_M.llamafile -ngl 9999 --temp 0.7 -p '[INST]Write a story about llamas[/INST]'
+./mistral-7b-instruct-v0.2.Q5_K_M.llamafile --temp 0.7 -p '[INST]Write a story about llamas[/INST]'
 ```
 
 And here is an example for WizardCoder-Python command-line llamafile:
 
 ```sh
-./wizardcoder-python-13b.llamafile -ngl 9999 --temp 0 -e -r '```\n' -p '```c\nvoid *memcpy_sse2(char *dst, const char *src, size_t size) {\n'
+./wizardcoder-python-13b.llamafile --temp 0 -e -r '```\n' -p '```c\nvoid *memcpy_sse2(char *dst, const char *src, size_t size) {\n'
 ```
 
 And here's an example for the LLaVA command-line llamafile:
 
 ```sh
-./llava-v1.5-7b-q4.llamafile -ngl 9999 --temp 0.2 --image lemurs.jpg -e -p '### User: What do you see?\n### Assistant:'
+./llava-v1.5-7b-q4.llamafile --temp 0.2 --image lemurs.jpg -e -p '### User: What do you see?\n### Assistant:'
 ```
 
 As before, macOS, Linux, and BSD users will need to use the "chmod"
@@ -266,7 +266,7 @@ For Windows users, here's an example for the Mistral LLM:
 ```sh
 curl -L -o llamafile.exe https://github.com/Mozilla-Ocho/llamafile/releases/download/0.6/llamafile-0.6
 curl -L -o mistral.gguf https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q4_K_M.gguf
-./llamafile.exe -m mistral.gguf -ngl 9999
+./llamafile.exe -m mistral.gguf
 ```
 
 Windows users may need to change `./llamafile.exe` to `.\llamafile.exe`
@@ -423,7 +423,7 @@ llama.cpp command line interface, utilizing WizardCoder-Python-13B
 weights:
 
 ```sh
-llamafile -ngl 9999 \
+llamafile \
   -m wizardcoder-python-13b-v1.0.Q8_0.gguf \
   --temp 0 -r '}\n' -r '```\n' \
   -e -p '```c\nvoid *memcpy(void *dst, const void *src, size_t size) {\n'
diff --git a/build/config.mk b/build/config.mk
index e08baa3248..8770dfe28e 100644
--- a/build/config.mk
+++ b/build/config.mk
@@ -2,7 +2,7 @@
 #── vi: set noet ft=make ts=8 sw=8 fenc=utf-8 :vi ────────────────────┘
 
 PREFIX = /usr/local
-COSMOCC = .cosmocc/3.3.5
+COSMOCC = .cosmocc/3.3.6
 TOOLCHAIN = $(COSMOCC)/bin/cosmo
 
 AR = $(TOOLCHAIN)ar
@@ -50,6 +50,7 @@ clean:; rm -rf o
 .PHONY: distclean
 distclean:; rm -rf o .cosmocc
 
-.cosmocc/3.3.5:
-	build/download-cosmocc.sh $@ 3.3.5 db78fd8d3f8706e9dff4be72bf71d37a3f12062f212f407e1c33bc4af3780dd0
+.cosmocc/3.3.6:
+	build/download-cosmocc.sh $@ 3.3.6 26e3449357f31b82489774ef5c2d502a711bb711d4faf99a5fd6c96328a1c205
+
 
diff --git a/llama.cpp/ggml-cuda.cu b/llama.cpp/ggml-cuda.cu
index d55a587fc5..3020013c58 100644
--- a/llama.cpp/ggml-cuda.cu
+++ b/llama.cpp/ggml-cuda.cu
@@ -212,7 +212,6 @@
 #include "ggml-backend-impl.h"
 
 static const struct ggml_backend_api *g_backend;
-#define exit g_backend->exit
 #define getenv g_backend->getenv
 #define FLAG_log_disable (*g_backend->FLAG_log_disable)
 #define ggml_backend_register g_backend->ggml_backend_register
@@ -242,6 +241,18 @@ static const struct ggml_backend_api *g_backend;
 #define ggml_is_empty g_backend->ggml_is_empty
 #define ggml_op_desc g_backend->ggml_op_desc
 
+[[noreturn]]
+static void exit_(int rc) {
+    g_backend->exit(rc);
+#define exit exit_
+#if defined(__GNUC__) || defined(__llvm__)
+    __builtin_unreachable();
+#elif defined(_MSC_VER)
+    __assume(0);
+#endif
+    for (;;);
+}
+
 // printf() and fprintf() runtime bridge
 // this is needed so text gets printed on windows
 // it also helps ensure the atomicity of log lines
@@ -484,6 +495,14 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
 }
 
+#define FP16_AVAILABLE (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) ? \
+                        defined(RDNA1) || defined(RDNA2) || defined(RDNA3) : __CUDA_ARCH__ >= CC_PASCAL)
+#define FP16_MMA_AVAILABLE (!(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA)
+#if FP16_MMA_AVAILABLE
+#include <mma.h>
+#endif
+
+#if defined(GGML_MINIMIZE_CODE_SIZE) && FP16_AVAILABLE // [jart]
 static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
 #pragma unroll
@@ -496,6 +515,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
    NO_DEVICE_CODE;
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
 }
+#endif // [jart]
 
 #if CUDART_VERSION < CUDART_HMASK
 static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
@@ -588,15 +608,6 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
 }
 #endif // defined(GGML_USE_HIPBLAS)
 
-#define FP16_AVAILABLE     defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) ? \
-    defined(RDNA1) || defined(RDNA2) || defined(RDNA3) : __CUDA_ARCH__ >= CC_PASCAL
-
-#define FP16_MMA_AVAILABLE !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
-
-#if FP16_MMA_AVAILABLE
-#include <mma.h>
-#endif
-
 // TODO: move to ggml-common.h
 static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
 
@@ -823,7 +834,9 @@ void ggml_cuda_op_dequantize_mul_mat_vec(
     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
     const int64_t src1_padded_row_size, cudaStream_t stream);
 
+#ifndef GGML_MINIMIZE_CODE_SIZE // [jart]
 void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+#endif
 
 #define CUDA_GET_ROWS_BLOCK_SIZE 256
 
@@ -5785,6 +5798,7 @@ template <int D, int cols_per_block, int nwarps, typename KQ_acc_t> void launch_
     launch_fattn_f16_impl<D, cols_per_block, nwarps, 1, KQ_acc_t>(Q, K, V, KQV, mask, pool, main_stream);
 }
 
+#ifndef GGML_MINIMIZE_CODE_SIZE // [jart]
 void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * Q = dst->src[0];
     const ggml_tensor * K = dst->src[1];
@@ -5966,6 +5980,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
     }
     return;
 }
+#endif // GGML_MINIMIZE_CODE_SIZE [jart]
 
 template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 static __global__ void k_get_rows(
@@ -12501,9 +12516,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_ARGSORT:
             ggml_cuda_op_argsort(ctx, dst);
             break;
+#ifndef GGML_MINIMIZE_CODE_SIZE // [jart]
         case GGML_OP_FLASH_ATTN_EXT:
             ggml_cuda_flash_attn_ext(ctx, dst);
             break;
+#endif
         default:
             return false;
     }
@@ -12778,7 +12795,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_ARANGE:
         case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_LEAKY_RELU:
+#ifndef GGML_MINIMIZE_CODE_SIZE // [jart]
         case GGML_OP_FLASH_ATTN_EXT:
+#endif
             return true;
         default:
             return false;
diff --git a/llama.cpp/ggml-vector.inc b/llama.cpp/ggml-vector.inc
index ead2de864a..58e25dce64 100644
--- a/llama.cpp/ggml-vector.inc
+++ b/llama.cpp/ggml-vector.inc
@@ -3,7 +3,6 @@
 
 #include "ggml-vector.h"
 #include "ggml-impl.h"
-#include "llamafile/expf.h"
 #include "llamafile/llamafile.h"
 #include <stdatomic.h>
 
@@ -1185,36 +1184,157 @@ void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
     }
 }
 
-void ggml_vec_silu_f32(const int n, float * y, const float * x) {
-    int i = 0;
+#if defined(__ARM_NEON)
 
-#if defined(__AVX512F__)
-    if (!FLAG_trap && !FLAG_precise) {
-        for (; i + 15 < n; i += 16) {
-            _mm512_storeu_ps(y + i, llamafile_silu_avx512(_mm512_loadu_ps(x + i)));
-        }
-    }
+// adapted from arm limited optimized routine
+// by justine tunney on the llamafile project
+// the maximum error is 1.45358 plus 0.5 ulps
+// numbers above 88.38 will flush to infinity
+// numbers beneath -103.97 will flush to zero
+inline static float32x4_t ggml_v_expf(float32x4_t x) {
+    const float32x4_t r = vdupq_n_f32(0x1.8p23f);
+    const float32x4_t z = vfmaq_f32(r, x, vdupq_n_f32(0x1.715476p+0f));
+    const float32x4_t n = vsubq_f32(z, r);
+    const float32x4_t b =
+        vfmsq_f32(vfmsq_f32(x, n, vdupq_n_f32(0x1.62e4p-1f)), n, vdupq_n_f32(0x1.7f7d1cp-20f));
+    const uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23);
+    const float32x4_t k = vreinterpretq_f32_u32(vaddq_u32(e, vdupq_n_u32(0x3f800000)));
+    const uint32x4_t c = vcagtq_f32(n, vdupq_n_f32(126));
+    const float32x4_t u = vmulq_f32(b, b);
+    const float32x4_t j = vfmaq_f32(
+        vmulq_f32(vdupq_n_f32(0x1.ffffecp-1f), b),
+        vfmaq_f32(vfmaq_f32(vdupq_n_f32(0x1.fffdb6p-2f), vdupq_n_f32(0x1.555e66p-3f), b),
+                  vfmaq_f32(vdupq_n_f32(0x1.573e2ep-5f), vdupq_n_f32(0x1.0e4020p-7f), b), u),
+        u);
+    if (!vpaddd_u64(vreinterpretq_u64_u32(c)))
+        return vfmaq_f32(k, j, k);
+    const uint32x4_t d = vandq_u32(vclezq_f32(n), vdupq_n_u32(0x82000000));
+    const float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(d, vdupq_n_u32(0x7f000000)));
+    const float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, d));
+    return vbslq_f32(vcagtq_f32(n, vdupq_n_f32(192)), vmulq_f32(s1, s1),
+                     vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j)));
+}
+
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static float32x4_t ggml_v_silu(float32x4_t x) {
+    const float32x4_t one = vdupq_n_f32(1.0f);
+    const float32x4_t zero = vdupq_n_f32(0.0f);
+    const float32x4_t neg_x = vsubq_f32(zero, x);
+    const float32x4_t exp_neg_x = ggml_v_expf(neg_x);
+    const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x);
+    return vdivq_f32(x, one_plus_exp_neg_x);
+}
+
+#elif defined(__SSE2__) // __ARM_NEON
+
+#if defined(__FMA__)
+#define MADD128(x, y, z) _mm_fmadd_ps(x, y, z)
+#define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z)
+#else
+#define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z)
+#define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y))
 #endif
 
-#if defined(__AVX2__) && defined(__FMA__)
-    if (!FLAG_trap && !FLAG_precise) {
-        for (; i + 7 < n; i += 8) {
-            _mm256_storeu_ps(y + i, llamafile_silu_avx2fma(_mm256_loadu_ps(x + i)));
-        }
+// adapted from arm limited optimized routine
+// by justine tunney on the llamafile project
+// the maximum error is 1.45358 plus 0.5 ulps
+// numbers above 88.38 will flush to infinity
+// numbers beneath -103.97 will flush to zero
+inline static __m128 ggml_v_expf(__m128 x) {
+    const __m128 r = _mm_set1_ps(0x1.8p23f);
+    const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r);
+    const __m128 n = _mm_sub_ps(z, r);
+    const __m128 b =
+        NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x));
+    const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23);
+    const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1))));
+    const __m128i c =
+        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126)));
+    const __m128 u = _mm_mul_ps(b, b);
+    const __m128 j =
+        MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u,
+                        MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))),
+                u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b));
+    if (!_mm_movemask_epi8(c))
+        return MADD128(j, k, k);
+    const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())),
+                                    _mm_set1_epi32(0x82000000u));
+    const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u)));
+    const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g));
+    const __m128i d =
+        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192)));
+    return _mm_or_ps(
+        _mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)),
+        _mm_andnot_ps(_mm_castsi128_ps(d),
+                      _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)),
+                                _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k)))));
+}
+
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static __m128 ggml_v_silu(__m128 x) {
+    const __m128 one = _mm_set1_ps(1);
+    const __m128 zero = _mm_setzero_ps();
+    const __m128 neg_x = _mm_sub_ps(zero, x);
+    const __m128 exp_neg_x = ggml_v_expf(neg_x);
+    const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x);
+    return _mm_div_ps(x, one_plus_exp_neg_x);
+}
+
+#endif // __ARM_NEON / __SSE2__
+
+void ggml_vec_silu_f32(const int n, float * y, const float * x) {
+    int i = 0;
+    if (!FLAG_trap) {
+#if defined(__SSE2__)
+    for (; i + 3 < n; i += 4) {
+        _mm_storeu_ps(y + i, ggml_v_silu(_mm_loadu_ps(x + i)));
+    }
+#elif defined(__ARM_NEON)
+    for (; i + 3 < n; i += 4) {
+        vst1q_f32(y + i, ggml_v_silu(vld1q_f32(x + i)));
     }
 #endif
+    }
+    for (; i < n; ++i) {
+        y[i] = ggml_silu_f32(x[i]);
+    }
+}
 
-#if defined(__ARM_NEON)
+float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
+    int i = 0;
+    ggml_float sum = 0;
     if (!FLAG_trap) {
-        for (; i + 3 < n; i += 4) {
-            vst1q_f32(y + i, llamafile_silu_neon(vld1q_f32(x + i)));
-        }
+#if defined(__SSE2__)
+    for (; i + 3 < n; i += 4) {
+        __m128 val = ggml_v_expf(_mm_sub_ps(_mm_loadu_ps(x + i),
+                                            _mm_set1_ps(max)));
+        _mm_storeu_ps(y + i, val);
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+        val = _mm_add_ps(val, _mm_movehl_ps(val, val));
+        val = _mm_add_ss(val, _mm_movehdup_ps(val));
+#else
+        __m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1));
+        val = _mm_add_ps(val, tmp);
+        tmp = _mm_movehl_ps(tmp, val);
+        val = _mm_add_ss(val, tmp);
+#endif
+        sum += (ggml_float)_mm_cvtss_f32(val);
+    }
+#elif defined(__ARM_NEON)
+    for (; i + 3 < n; i += 4) {
+        float32x4_t val = ggml_v_expf(vsubq_f32(vld1q_f32(x + i),
+                                                vdupq_n_f32(max)));
+        vst1q_f32(y + i, val);
+        sum += vaddvq_f32(val);
     }
 #endif
-
+    }
     for (; i < n; ++i) {
-        y[i] = ggml_silu_f32(x[i]);
+        float val = expf(x[i] - max);
+        sum += (ggml_float)val;
+        y[i] = val;
     }
+    return sum;
 }
 
 float ggml_silu_backward_f32(float x, float dy) {
@@ -1282,51 +1402,3 @@ void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
     }
     *s = idx;
 }
-
-float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
-    int i = 0;
-    ggml_float sum = 0;
-
-#if defined(__AVX512F__)
-    if (!FLAG_trap && !FLAG_precise) {
-        for (; i + 15 < n; i += 16) {
-            __m512 val = llamafile_expf_avx512(_mm512_sub_ps(_mm512_loadu_ps(x + i),
-                                                             _mm512_set1_ps(max)));
-            _mm512_storeu_ps(y + i, val);
-            sum += _mm512_reduce_add_ps(val);
-        }
-    }
-#endif
-
-#if defined(__AVX2__) && defined(__FMA__)
-    if (!FLAG_trap && !FLAG_precise) {
-        for (; i + 7 < n; i += 8) {
-            __m256 val = llamafile_expf_avx2fma(_mm256_sub_ps(_mm256_loadu_ps(x + i),
-                                                              _mm256_set1_ps(max)));
-            _mm256_storeu_ps(y + i, val);
-            __m128 v2 = _mm_add_ps(_mm256_extractf128_ps(val, 1),
-                                   _mm256_castps256_ps128(val));
-            v2 = _mm_add_ps(v2, _mm_movehl_ps(v2, v2));
-            v2 = _mm_add_ss(v2, _mm_movehdup_ps(v2));
-            sum += _mm_cvtss_f32(v2);
-        }
-    }
-#endif
-
-#ifdef __ARM_NEON
-    if (!FLAG_trap) {
-        for (; i + 3 < n; i += 4) {
-            float32x4_t val = llamafile_expf_neon(vsubq_f32(vld1q_f32(x + i), vdupq_n_f32(max)));
-            vst1q_f32(y + i, val);
-            sum += vaddvq_f32(val);
-        }
-    }
-#endif
-
-    for (; i < n; ++i) {
-        float val = expf(x[i] - max);
-        sum += (ggml_float)val;
-        y[i] = val;
-    }
-    return sum;
-}
diff --git a/llama.cpp/ggml.c b/llama.cpp/ggml.c
index 83b3b14bef..a90a4ac9da 100644
--- a/llama.cpp/ggml.c
+++ b/llama.cpp/ggml.c
@@ -17756,6 +17756,7 @@ struct ggml_compute_state {
     int ith;
     struct ggml_compute_state_shared * shared;
     enum ggml_status ec;
+    bool is_main_thread; // [jart]
 };
 
 static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
@@ -18064,7 +18065,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     int task_phase = GGML_TASK_TYPE_FINALIZE;
 
 #ifdef LLAMAFILE_DEBUG
-    if (FLAG_trap) {
+    if (FLAG_trap && !state->is_main_thread) {
         llamafile_trapping_enabled(+1);
     }
 #endif
@@ -18522,6 +18523,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
                 .ith = j,
                 .shared = &state_shared,
                 .ec = GGML_STATUS_SUCCESS,
+                .is_main_thread = false, // [jart]
             };
 
             const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
@@ -18533,6 +18535,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
     workers[0].ith = 0;
     workers[0].shared = &state_shared;
     workers[0].ec = GGML_STATUS_SUCCESS;
+    workers[0].is_main_thread = true; // [jart]
 
     const int64_t perf_start_cycles  = ggml_perf_cycles();
     const int64_t perf_start_time_us = ggml_perf_time_us();
diff --git a/llama.cpp/llama.cpp b/llama.cpp/llama.cpp
index 7d7288aaf9..3b341e5aef 100644
--- a/llama.cpp/llama.cpp
+++ b/llama.cpp/llama.cpp
@@ -2,6 +2,7 @@
 // vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
 #define LLAMA_API_INTERNAL
 #include "llamafile/log.h"
+#include "llamafile/debug.h"
 #include "llama.h"
 
 #include "unicode.h"
@@ -17661,6 +17662,7 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) {
 
 void llama_print_timings(struct llama_context * ctx) {
     const llama_timings timings = llama_get_timings(ctx);
+    llamafile_trapping_enabled(-1);  // [jart]
 
     LLAMA_LOG_INFO("\n");
     LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, timings.t_load_ms);
@@ -17671,6 +17673,8 @@ void llama_print_timings(struct llama_context * ctx) {
     LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
     LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
+
+    llamafile_trapping_enabled(+1);  // [jart]
 }
 
 void llama_reset_timings(struct llama_context * ctx) {
diff --git a/llama.cpp/llava/clip.cpp b/llama.cpp/llava/clip.cpp
index dc7e0fb5fc..fdcc8b93cf 100644
--- a/llama.cpp/llava/clip.cpp
+++ b/llama.cpp/llava/clip.cpp
@@ -139,7 +139,8 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
 static int get_key_idx(const gguf_context * ctx, const char * key) {
     int i = gguf_find_key(ctx, key);
     if (i == -1) {
-        LOG_TEE("key %s not found in file\n", key);
+        // [jart] don't log to console errors that aren't errors
+        LOG("%s: note: key %s not found in file\n", __func__, key);
         throw std::runtime_error(format("Missing required key: %s", key));
     }
 
diff --git a/llama.cpp/llava/llava-cli.cpp b/llama.cpp/llava/llava-cli.cpp
index 57621ae6a9..1df2af9a7d 100644
--- a/llama.cpp/llava/llava-cli.cpp
+++ b/llama.cpp/llava/llava-cli.cpp
@@ -296,14 +296,13 @@ int llava_cli(int argc, char ** argv, gpt_params & params) {
 
 #ifndef LOG_DISABLE_LOGS
     log_set_target(log_filename_generator("llava", "log"));
-    LOG_TEE("Log start\n");
     log_dump_cmdline(argc, argv);
     llama_log_set(llama_log_callback_logTee, nullptr);
 #endif // LOG_DISABLE_LOGS
 
     if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
         gpt_print_usage(argc, argv, params);
-        show_additional_info(argc, argv);
+        // show_additional_info(argc, argv); // [jart] no help unless we ask for it
         return 1;
     }
     auto model = llava_init(&params);
diff --git a/llama.cpp/main/embedding.cpp b/llama.cpp/main/embedding.cpp
index 4ce980e324..583f88152d 100644
--- a/llama.cpp/main/embedding.cpp
+++ b/llama.cpp/main/embedding.cpp
@@ -78,7 +78,6 @@ int embedding_cli(int argc, char ** argv) {
 
 #ifndef LOG_DISABLE_LOGS
     log_set_target(stderr);
-    LOG_TEE("Log start\n");
     log_dump_cmdline(argc, argv);
     llama_log_set(llama_log_callback_logTee, nullptr);
 #endif // LOG_DISABLE_LOGS
diff --git a/llama.cpp/main/main.cpp b/llama.cpp/main/main.cpp
index 9e04e2b04b..4c0eb97d24 100644
--- a/llama.cpp/main/main.cpp
+++ b/llama.cpp/main/main.cpp
@@ -179,7 +179,6 @@ int main(int argc, char ** argv) {
 
 #ifndef LOG_DISABLE_LOGS
     log_set_target(log_filename_generator("main", "log"));
-    LOG_TEE("Log start\n");
     log_dump_cmdline(argc, argv);
     llama_log_set(llama_log_callback_logTee, nullptr);
 #endif // LOG_DISABLE_LOGS
@@ -206,7 +205,7 @@ int main(int argc, char ** argv) {
         }
         __pledge_mode = PLEDGE_PENALTY_RETURN_EPERM;
         if (pledge(0, 0)) {
-            LOG_TEE("warning: this OS doesn't support pledge() security\n");
+            LOG("warning: this OS doesn't support pledge() security\n");
         } else if (pledge(promises, 0)) {
             perror("pledge");
             exit(1);
@@ -594,6 +593,7 @@ int main(int argc, char ** argv) {
         antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
     }
 
+    bool should_show_special_tokens = sparams.grammar.empty(); // [jart] for shell scriptability
     struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
     if (!ctx_sampling) { // [jart] fixes crash
         fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
@@ -812,7 +812,12 @@ int main(int argc, char ** argv) {
         // display text
         if (input_echo && display) {
             for (auto id : embd) {
-                const std::string token_str = llama_token_to_piece(ctx, id);
+
+                // [jart] shell scriptability
+                const std::string token_str =
+                        llama_token_to_piece(
+                            ctx, id, should_show_special_tokens);
+
                 printf("%s", token_str.c_str());
 
                 if (embd.size() > 1) {
diff --git a/llamafile/BUILD.mk b/llamafile/BUILD.mk
index a6114ef45c..58b272b16c 100644
--- a/llamafile/BUILD.mk
+++ b/llamafile/BUILD.mk
@@ -89,6 +89,7 @@ o/$(MODE)/llamafile:					\
 #
 
 o/$(MODE)/llamafile/sgemm.o: private CXXFLAGS += -Os
+o/$(MODE)/llamafile/iqk_mul_mat.o: private TARGET_ARCH += -Xx86_64-mtune=skylake -Xx86_64-mavx2 -Xx86_64-mfma
 o/$(MODE)/llamafile/tinyblas_cpu_sgemm_amd_avx.o: private TARGET_ARCH += -Xx86_64-mtune=sandybridge -Xx86_64-mf16c
 o/$(MODE)/llamafile/tinyblas_cpu_mixmul_amd_avx.o: private TARGET_ARCH += -Xx86_64-mtune=sandybridge -Xx86_64-mf16c
 o/$(MODE)/llamafile/tinyblas_cpu_sgemm_amd_fma.o: private TARGET_ARCH += -Xx86_64-mtune=bdver2 -Xx86_64-mf16c -Xx86_64-mfma
@@ -134,6 +135,10 @@ o/$(MODE)/llamafile/tinyblas_test:			\
 		o/$(MODE)/llamafile/tester.o
 	build/cudacc -g -o $@ $^ -lcublas
 
+o/$(MODE)/llamafile/compcap:				\
+		o/$(MODE)/llamafile/compcap.o
+	build/cudacc -g -o $@ $^ -lcublas
+
 o/$(MODE)/llamafile/cudaprops:				\
 		o/$(MODE)/llamafile/cudaprops.o		\
 		o/$(MODE)/llamafile/tester.o
diff --git a/llamafile/cuda.bat b/llamafile/cuda.bat
index 4f1997eb9f..4dd5b6d6bc 100644
--- a/llamafile/cuda.bat
+++ b/llamafile/cuda.bat
@@ -5,12 +5,7 @@
 :: driver on a Windows system that has a CUDA-capable GPU installed.
 
 nvcc --shared ^
-     -gencode=arch=compute_50,code=sm_50 ^
-     -gencode=arch=compute_60,code=sm_60 ^
-     -gencode=arch=compute_70,code=sm_70 ^
-     -gencode=arch=compute_75,code=sm_75 ^
-     -gencode=arch=compute_80,code=sm_80 ^
-     -gencode=arch=compute_90,code=sm_90 ^
+     -arch=all-major ^
      --forward-unknown-to-host-compiler ^
      -Xcompiler="/nologo /EHsc /O2 /GR /MT" ^
      -DNDEBUG ^
@@ -21,6 +16,7 @@ nvcc --shared ^
      -DGGML_CUDA_DMMV_X=32 ^
      -DK_QUANTS_PER_ITERATION=2 ^
      -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 ^
+     -DGGML_MINIMIZE_CODE_SIZE ^
      -DGGML_USE_TINYBLAS ^
      -o ggml-cuda.dll ^
      ggml-cuda.cu ^
diff --git a/llamafile/cuda.sh b/llamafile/cuda.sh
index 99af991b0d..fc4d0c3bfb 100755
--- a/llamafile/cuda.sh
+++ b/llamafile/cuda.sh
@@ -23,15 +23,9 @@ cd "$TMP"
 
 /usr/local/cuda/bin/nvcc \
   --shared \
-  -gencode=arch=compute_50,code=sm_50 \
-  -gencode=arch=compute_60,code=sm_60 \
-  -gencode=arch=compute_70,code=sm_70 \
-  -gencode=arch=compute_75,code=sm_75 \
-  -gencode=arch=compute_80,code=sm_80 \
-  -gencode=arch=compute_90,code=sm_90 \
+  -arch=all-major \
   --forward-unknown-to-host-compiler \
-  --compiler-options \
-  "-fPIC -O3 -march=native -mtune=native" \
+  --compiler-options "-fPIC -O2" \
   -DNDEBUG \
   -DGGML_BUILD=1 \
   -DGGML_SHARED=1 \
@@ -40,6 +34,7 @@ cd "$TMP"
   -DGGML_CUDA_DMMV_X=32 \
   -DK_QUANTS_PER_ITERATION=2 \
   -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 \
+  -DGGML_MINIMIZE_CODE_SIZE \
   -DGGML_USE_TINYBLAS \
   -o ~/ggml-cuda.so \
   ggml-cuda.cu \
diff --git a/llamafile/expf.h b/llamafile/expf.h
deleted file mode 100644
index 200829546f..0000000000
--- a/llamafile/expf.h
+++ /dev/null
@@ -1,146 +0,0 @@
-// -*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
-// vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi
-//
-// Copyright 2023 Mozilla Foundation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#ifdef __AVX512F__
-#include <immintrin.h>
-
-// computes expf() for each element in vector.
-//
-// the maximum error is 1.45358 +0.5 ulp. the only difference between
-// this function and expf(), is that there's currently no support for
-// subnormals. input values are clamped to range: [-87.6831, 88.3763]
-// whereas expf() allows inputs as low as -103.972. therefore numbers
-// will be flushed to zero sooner than they otherwise would with this
-// function. nearest rounding mode is always used. exception trapping
-// isn't supported although this function does a good job avoiding it
-//
-static inline __m512 llamafile_expf_avx512(__m512 x) {
-    __m512 a, b, c, d, e, f, g;
-    __m512 will_turn_into_inf = _mm512_set1_ps(0x1.62e44p+6f);
-    __m512 max_before_overflow = _mm512_set1_ps(0x1.61814cp+6f);
-    __m512 min_before_underflow = _mm512_set1_ps(-0x1.5ebb86p+6f);
-    x = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(x, max_before_overflow, _CMP_GE_OQ), x,
-                             will_turn_into_inf);
-    x = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(x, min_before_underflow, _CMP_LE_OQ), x,
-                             min_before_underflow);
-    a = _mm512_fmadd_round_ps(_mm512_set1_ps(0x1.715476p+0f), x, _mm512_set1_ps(0x1.8p23f),
-                              _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
-    b = _mm512_sub_ps(a, _mm512_set1_ps(0x1.8p23f));
-    c = _mm512_fnmadd_round_ps(_mm512_set1_ps(0x1.62e4p-1f), b, x,
-                               _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
-    c = _mm512_fnmadd_round_ps(_mm512_set1_ps(0x1.7f7d1cp-20f), b, c,
-                               _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
-    d = _mm512_castsi512_ps(_mm512_add_epi32(_mm512_slli_epi32(_mm512_castps_si512(a), 23),
-                                             _mm512_set1_epi32(0x3f800000u)));
-    e = _mm512_mul_round_ps(c, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
-    f = _mm512_fmadd_round_ps(_mm512_set1_ps(0x1.0e4020p-7f), c, _mm512_set1_ps(0x1.573e2ep-5f),
-                              _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
-    g = _mm512_fmadd_round_ps(_mm512_set1_ps(0x1.555e66p-3f), c, _mm512_set1_ps(0x1.fffdb6p-2f),
-                              _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
-    g = _mm512_fmadd_round_ps(f, e, g, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
-    f = _mm512_mul_round_ps(_mm512_set1_ps(0x1.ffffecp-1f), c,
-                            _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
-    return _mm512_fmadd_round_ps(
-        _mm512_fmadd_round_ps(g, e, f, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC), d, d,
-        _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
-}
-
-// computes silu x/(1+exp(-x)) in single precision
-static inline __m512 llamafile_silu_avx512(__m512 x) {
-    __m512 one = _mm512_set1_ps(1);
-    __m512 zero = _mm512_setzero_ps();
-    __m512 neg_x = _mm512_sub_ps(zero, x);
-    __m512 exp_neg_x = llamafile_expf_avx512(neg_x);
-    __m512 one_plus_exp_neg_x = _mm512_add_ps(one, exp_neg_x);
-    return _mm512_div_ps(x, one_plus_exp_neg_x);
-}
-
-#endif // __AVX512F__
-
-#if defined(__AVX2__) && defined(__FMA__)
-#include <immintrin.h>
-
-// computes expf() for each element in vector.
-//
-// the maximum error is 1.45358 +0.5 ulp. the only difference between
-// this function and expf(), is that there's currently no support for
-// subnormals. input values are clamped to range: [-87.6831, 88.3763]
-// whereas expf() allows inputs as low as -103.972. therefore numbers
-// will be flushed to zero sooner than they otherwise would with this
-// function. exception trapping isnt supported although this function
-// does a good job avoiding it.
-//
-static inline __m256 llamafile_expf_avx2fma(__m256 x) {
-    __m256 a, b, c, d, e, f, g;
-    __m256 will_turn_into_inf = _mm256_set1_ps(0x1.62e44p+6f);
-    __m256 max_before_overflow = _mm256_set1_ps(0x1.61814cp+6f);
-    __m256 min_before_underflow = _mm256_set1_ps(-0x1.5ebb86p+6f);
-    __m256 min_mask = _mm256_cmp_ps(x, min_before_underflow, _CMP_LE_OQ);
-    __m256 max_mask = _mm256_cmp_ps(x, max_before_overflow, _CMP_GE_OQ);
-    x = _mm256_or_ps(_mm256_and_ps(min_mask, min_before_underflow), _mm256_andnot_ps(min_mask, x));
-    x = _mm256_or_ps(_mm256_and_ps(max_mask, will_turn_into_inf), _mm256_andnot_ps(max_mask, x));
-    a = _mm256_fmadd_ps(_mm256_set1_ps(0x1.715476p+0f), x, _mm256_set1_ps(0x1.8p23f));
-    b = _mm256_sub_ps(a, _mm256_set1_ps(0x1.8p23f));
-    c = _mm256_fnmadd_ps(_mm256_set1_ps(0x1.62e4p-1f), b, x);
-    c = _mm256_fnmadd_ps(_mm256_set1_ps(0x1.7f7d1cp-20f), b, c);
-    d = _mm256_castsi256_ps(_mm256_add_epi32(_mm256_slli_epi32(_mm256_castps_si256(a), 23),
-                                             _mm256_set1_epi32(0x3f800000u)));
-    e = _mm256_mul_ps(c, c);
-    f = _mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), c, _mm256_set1_ps(0x1.573e2ep-5f));
-    g = _mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), c, _mm256_set1_ps(0x1.fffdb6p-2f));
-    g = _mm256_fmadd_ps(f, e, g);
-    f = _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), c);
-    return _mm256_fmadd_ps(_mm256_fmadd_ps(g, e, f), d, d);
-}
-
-// computes silu x/(1+exp(-x)) in single precision
-static inline __m256 llamafile_silu_avx2fma(__m256 x) {
-    __m256 one = _mm256_set1_ps(1);
-    __m256 zero = _mm256_setzero_ps();
-    __m256 neg_x = _mm256_sub_ps(zero, x);
-    __m256 exp_neg_x = llamafile_expf_avx2fma(neg_x);
-    __m256 one_plus_exp_neg_x = _mm256_add_ps(one, exp_neg_x);
-    return _mm256_div_ps(x, one_plus_exp_neg_x);
-}
-
-#endif // __AVX2__
-
-#ifdef __ARM_NEON
-#include <arm_neon.h>
-
-float32x4_t v_expf(float32x4_t);
-
-static inline float32x4_t llamafile_expf_neon(float32x4_t x) {
-    return v_expf(x);
-}
-
-static inline float32x4_t llamafile_silu_neon(float32x4_t x) {
-    float32x4_t one = vdupq_n_f32(1.0f);
-    float32x4_t zero = vdupq_n_f32(0.0f);
-    float32x4_t neg_x = vsubq_f32(zero, x);
-    float32x4_t exp_neg_x = llamafile_expf_neon(neg_x);
-    float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x);
-    return vdivq_f32(x, one_plus_exp_neg_x);
-}
-
-#endif
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/llamafile/get_app_dir.c b/llamafile/get_app_dir.c
index 957473d914..cb95afd610 100644
--- a/llamafile/get_app_dir.c
+++ b/llamafile/get_app_dir.c
@@ -16,12 +16,20 @@
 // limitations under the License.
 
 #include "llamafile.h"
+#include <stdlib.h>
 #include <string.h>
 
+static const char *llamafile_get_home_dir(void) {
+    const char *homedir;
+    if (!(homedir = getenv("HOME")) || !*homedir)
+        homedir = ".";
+    return homedir;
+}
+
 /**
  * Returns path of directory for app-specific files.
  */
 void llamafile_get_app_dir(char *path, size_t size) {
-    strlcpy(path, llamafile_get_tmp_dir(), size);
+    strlcpy(path, llamafile_get_home_dir(), size);
     strlcat(path, "/.llamafile/", size);
 }
diff --git a/llamafile/iqk_mul_mat.inc b/llamafile/iqk_mul_mat.cpp
similarity index 99%
rename from llamafile/iqk_mul_mat.inc
rename to llamafile/iqk_mul_mat.cpp
index 2ffa2b5e41..112544d7a3 100644
--- a/llamafile/iqk_mul_mat.inc
+++ b/llamafile/iqk_mul_mat.cpp
@@ -14,7 +14,15 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-//
+
+#ifdef __x86_64__
+
+#include "llama.cpp/ggml-impl.h"
+#include "llama.cpp/ggml-quants.h"
+#include "sgemm.h"
+
+// clang-format off
+
 // This matrix - vector and matrix - matrix multiplication implementation
 // for k-quants and IQ4_XS makes prompt processing 150-200% faster
 // compared to mainline llama.cpp (and llamafile).
@@ -26,7 +34,6 @@
 // multiplication (as needed for prompt processing), we can get
 // a significant speedup by reusing the unpacked QX quants and scales
 // for multiplication with several Q8_K columns.
-//
 
 namespace {
 
@@ -710,6 +717,8 @@ static void mul_mat_iq4_xs_q8_K_T(int n, float * s, size_t bs, const void * vx,
 
 }
 
+} // namespace
+
 //
 // ============================== Matrix multiplications
 //
@@ -777,4 +786,4 @@ bool iqk_mul_mat(long Nx, long Ny, long ne00, int typeA, const void * A, const v
     return true;
 }
 
-}
+#endif // __x86_64__
diff --git a/llamafile/rocm.bat b/llamafile/rocm.bat
index 58874d3c69..c76fd9aa70 100644
--- a/llamafile/rocm.bat
+++ b/llamafile/rocm.bat
@@ -31,6 +31,7 @@
   -DGGML_CUDA_MMV_Y=1 ^
   -DGGML_USE_HIPBLAS ^
   -DGGML_USE_TINYBLAS ^
+  -DGGML_MINIMIZE_CODE_SIZE ^
   -DK_QUANTS_PER_ITERATION=2 ^
   -D_CRT_SECURE_NO_WARNINGS ^
   -D_XOPEN_SOURCE=600 ^
diff --git a/llamafile/rocm.sh b/llamafile/rocm.sh
index b7091ec2bd..444922555c 100755
--- a/llamafile/rocm.sh
+++ b/llamafile/rocm.sh
@@ -14,12 +14,13 @@ hipcc \
   -DGGML_SHARED=1 \
   -Wno-return-type \
   -Wno-unused-result \
-  -DGGML_USE_HIPBLAS \
-  -DGGML_USE_TINYBLAS \
   -DGGML_CUDA_MMV_Y=1 \
   -DGGML_MULTIPLATFORM \
+  -DGGML_USE_HIPBLAS=1 \
+  -DGGML_USE_TINYBLAS=1 \
   -DGGML_CUDA_DMMV_X=32 \
   -DK_QUANTS_PER_ITERATION=2 \
+  -DGGML_MINIMIZE_CODE_SIZE=1 \
   -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 \
   --amdgpu-target=gfx1100,gfx1031,gfx1030,gfx1032,gfx906,gfx1101,gfx1102,gfx1103 \
   -o ggml-rocm.so \
diff --git a/llamafile/sgemm.h b/llamafile/sgemm.h
index 5ac2d97457..e9a99577c8 100644
--- a/llamafile/sgemm.h
+++ b/llamafile/sgemm.h
@@ -7,6 +7,8 @@ extern "C" {
 struct ggml_tensor;
 struct ggml_compute_params;
 
+bool iqk_mul_mat(long, long, long, int, const void *, const void *, float *, long, int, int);
+
 bool llamafile_sgemm(long, long, long, const void *, long, const void *, long, void *, long, int,
                      int, int, int, int, int, int);
 bool llamafile_mixmul(const struct ggml_compute_params *, const struct ggml_tensor *,
diff --git a/llamafile/tinyblas_cpu_mixmul.inc b/llamafile/tinyblas_cpu_mixmul.inc
index a841114d92..9e42d87400 100644
--- a/llamafile/tinyblas_cpu_mixmul.inc
+++ b/llamafile/tinyblas_cpu_mixmul.inc
@@ -147,10 +147,6 @@ class MixMul {
         switch (result->type) {
         case GGML_TYPE_F32:
             return mixmuler<float>();
-        case GGML_TYPE_BF16:
-            return mixmuler<ggml_bf16_t>();
-        case GGML_TYPE_F16:
-            return mixmuler<ggml_fp16_t>();
         default:
             return false;
         }
diff --git a/llamafile/tinyblas_cpu_sgemm.inc b/llamafile/tinyblas_cpu_sgemm.inc
index f90307f726..5d7d97a1e0 100644
--- a/llamafile/tinyblas_cpu_sgemm.inc
+++ b/llamafile/tinyblas_cpu_sgemm.inc
@@ -269,10 +269,6 @@ bool llamafile_sgemm_impl(long m, long n, long k, const void *A, long lda, const
 
 } // namespace
 
-#if defined __AVX2__ && QK_K == 256
-#include "iqk_mul_mat.inc"
-#endif
-
 /**
  * Performs optimized matrix multiplication on CPU.
  *
@@ -317,10 +313,12 @@ bool llamafile_sgemm(long m, long n, long k, const void *A, long lda, const void
     assert(nth > 0);
     assert(ith < nth);
 
-#if defined __AVX2__ && QK_K == 256
-    if (Btype == GGML_TYPE_Q8_K && Ctype == GGML_TYPE_F32) {
-        if (iqk_mul_mat(m, n, k*QK_K, Atype, A, B, (float *)C, ldc, ith, nth)) {
-            return true;
+#if defined(__x86_64__) && QK_K == 256
+    if (X86_HAVE(AVX2) && X86_HAVE(FMA)) {
+        if (Btype == GGML_TYPE_Q8_K && Ctype == GGML_TYPE_F32) {
+            if (iqk_mul_mat(m, n, k * QK_K, Atype, A, B, (float *)C, ldc, ith, nth)) {
+                return true;
+            }
         }
     }
 #endif
@@ -329,12 +327,6 @@ bool llamafile_sgemm(long m, long n, long k, const void *A, long lda, const void
     case GGML_TYPE_F32:
         return llamafile_sgemm_impl(m, n, k, A, lda, B, ldb, (float *)C, ldc, ith, nth, task, Atype,
                                     Btype, Ctype, precision);
-    case GGML_TYPE_BF16:
-        return llamafile_sgemm_impl(m, n, k, A, lda, B, ldb, (ggml_bf16_t *)C, ldc, ith, nth, task,
-                                    Atype, Btype, Ctype, precision);
-    case GGML_TYPE_F16:
-        return llamafile_sgemm_impl(m, n, k, A, lda, B, ldb, (ggml_fp16_t *)C, ldc, ith, nth, task,
-                                    Atype, Btype, Ctype, precision);
     default:
         return NOT_SUPPORTED;
     }
diff --git a/llamafile/v_expf.c b/llamafile/v_expf.c
deleted file mode 100644
index 4127df9203..0000000000
--- a/llamafile/v_expf.c
+++ /dev/null
@@ -1,145 +0,0 @@
-// -*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
-// vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi
-//
-// Copyright 1999-2022 Arm Limited
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifdef __aarch64__
-
-#include <arm_neon.h>
-
-#define V4(X) {X, X, X, X}
-#define NOINLINE __attribute__((__noinline__))
-#define VPCS_ATTR __attribute__((aarch64_vector_pcs))
-#define likely(x) __builtin_expect(!!(x), 1)
-#define unlikely(x) __builtin_expect(x, 0)
-
-static inline uint32x4_t v_u32(uint32_t x) {
-    return (uint32x4_t)V4(x);
-}
-
-/* true if any elements of a vector compare result is non-zero.  */
-static inline int v_any_u32(uint32x4_t x) {
-    /* assume elements in x are either 0 or -1u.  */
-    return vpaddd_u64(vreinterpretq_u64_u32(x)) != 0;
-}
-
-static const struct data {
-    float32x4_t poly[5];
-    float32x4_t shift, inv_ln2, ln2_hi, ln2_lo;
-    uint32x4_t exponent_bias;
-#if !WANT_SIMD_EXCEPT
-    float32x4_t special_bound, scale_thresh;
-#endif
-} data = {
-    /* maxerr: 1.45358 +0.5 ulp.  */
-    .poly = {V4(0x1.0e4020p-7f), V4(0x1.573e2ep-5f), V4(0x1.555e66p-3f), V4(0x1.fffdb6p-2f),
-             V4(0x1.ffffecp-1f)},
-    .shift = V4(0x1.8p23f),
-    .inv_ln2 = V4(0x1.715476p+0f),
-    .ln2_hi = V4(0x1.62e4p-1f),
-    .ln2_lo = V4(0x1.7f7d1cp-20f),
-    .exponent_bias = V4(0x3f800000),
-#if !WANT_SIMD_EXCEPT
-    .special_bound = V4(126.0f),
-    .scale_thresh = V4(192.0f),
-#endif
-};
-
-#define C(i) d->poly[i]
-
-#if WANT_SIMD_EXCEPT
-
-#define TinyBound v_u32(0x20000000) /* asuint (0x1p-63).  */
-#define BigBound v_u32(0x42800000) /* asuint (0x1p6).  */
-#define SpecialBound v_u32(0x22800000) /* BigBound - TinyBound.  */
-
-static float32x4_t VPCS_ATTR NOINLINE special_case(float32x4_t x, float32x4_t y, uint32x4_t cmp) {
-    /* If fenv exceptions are to be triggered correctly, fall back to the scalar
-       routine to special lanes.  */
-    return v_call_f32(expf, x, y, cmp);
-}
-
-#else
-
-#define SpecialOffset v_u32(0x82000000)
-#define SpecialBias v_u32(0x7f000000)
-
-static float32x4_t VPCS_ATTR NOINLINE special_case(float32x4_t poly, float32x4_t n, uint32x4_t e,
-                                                   uint32x4_t cmp1, float32x4_t scale,
-                                                   const struct data *d) {
-    /* 2^n may overflow, break it up into s1*s2.  */
-    uint32x4_t b = vandq_u32(vclezq_f32(n), SpecialOffset);
-    float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(b, SpecialBias));
-    float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, b));
-    uint32x4_t cmp2 = vcagtq_f32(n, d->scale_thresh);
-    float32x4_t r2 = vmulq_f32(s1, s1);
-    float32x4_t r1 = vmulq_f32(vfmaq_f32(s2, poly, s2), s1);
-    /* Similar to r1 but avoids double rounding in the subnormal range.  */
-    float32x4_t r0 = vfmaq_f32(scale, poly, scale);
-    float32x4_t r = vbslq_f32(cmp1, r1, r0);
-    return vbslq_f32(cmp2, r2, r);
-}
-
-#endif
-
-float32x4_t v_expf(float32x4_t x) {
-    const struct data *d = &data;
-    __asm__("" : "+r"(d));
-    float32x4_t n, r, r2, scale, p, q, poly, z;
-    uint32x4_t cmp, e;
-
-#if WANT_SIMD_EXCEPT
-    /* asuint(x) - TinyBound >= BigBound - TinyBound.  */
-    cmp = vcgeq_u32(vsubq_u32(vandq_u32(vreinterpretq_u32_f32(x), v_u32(0x7fffffff)), TinyBound),
-                    SpecialBound);
-    float32x4_t xm = x;
-    /* If any lanes are special, mask them with 1 and retain a copy of x to allow
-       special case handler to fix special lanes later. This is only necessary if
-       fenv exceptions are to be triggered correctly.  */
-    if (unlikely(v_any_u32(cmp)))
-        x = vbslq_f32(cmp, v_f32(1), x);
-#endif
-
-    /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
-       x = ln2*n + r, with r in [-ln2/2, ln2/2].  */
-    z = vfmaq_f32(d->shift, x, d->inv_ln2);
-    n = vsubq_f32(z, d->shift);
-    r = vfmsq_f32(x, n, d->ln2_hi);
-    r = vfmsq_f32(r, n, d->ln2_lo);
-    e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23);
-    scale = vreinterpretq_f32_u32(vaddq_u32(e, d->exponent_bias));
-
-#if !WANT_SIMD_EXCEPT
-    cmp = vcagtq_f32(n, d->special_bound);
-#endif
-
-    r2 = vmulq_f32(r, r);
-    p = vfmaq_f32(C(1), C(0), r);
-    q = vfmaq_f32(C(3), C(2), r);
-    q = vfmaq_f32(q, p, r2);
-    p = vmulq_f32(C(4), r);
-    poly = vfmaq_f32(p, q, r2);
-
-    if (unlikely(v_any_u32(cmp)))
-#if WANT_SIMD_EXCEPT
-        return special_case(xm, vfmaq_f32(scale, poly, scale), cmp);
-#else
-        return special_case(poly, n, e, cmp, scale, d);
-#endif
-
-    return vfmaq_f32(scale, poly, scale);
-}
-
-#endif // __aarch64__
diff --git a/llamafile/version.h b/llamafile/version.h
index 881249e70f..7ed1afe3d0 100644
--- a/llamafile/version.h
+++ b/llamafile/version.h
@@ -2,7 +2,7 @@
 
 #define LLAMAFILE_MAJOR 0
 #define LLAMAFILE_MINOR 8
-#define LLAMAFILE_PATCH 1
+#define LLAMAFILE_PATCH 2
 #define LLAMAFILE_VERSION \
     (100000000 * LLAMAFILE_MAJOR + 1000000 * LLAMAFILE_MINOR + LLAMAFILE_PATCH)