From 960968e3ac5435becf16b132c7c81ea5d1df8a6a Mon Sep 17 00:00:00 2001
From: Troels Henriksen <athas@sigkill.dk>
Date: Thu, 10 Aug 2023 12:29:31 +0200
Subject: [PATCH] Define this centrally.

---
 rts/c/half.h                             |   6 +-
 rts/c/scalar.h                           | 200 +++++++++++------------
 rts/cuda/prelude.cu                      |   1 +
 rts/opencl/prelude.cl                    |   2 +
 src/Futhark/CodeGen/Backends/GenericC.hs |   1 +
 5 files changed, 103 insertions(+), 107 deletions(-)

diff --git a/rts/c/half.h b/rts/c/half.h
index 0f9a1231ab..bc4233ccac 100644
--- a/rts/c/half.h
+++ b/rts/c/half.h
@@ -217,7 +217,7 @@ __constant static const unsigned short offset_table[64] = {
   0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024,
   0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024 };
 
-static uint16_t float2halfbits(float value) {
+SCALAR_FUN_ATTR uint16_t float2halfbits(float value) {
   union { float x; uint32_t y; } u;
   u.x = value;
   uint32_t bits = u.y;
@@ -227,7 +227,7 @@ static uint16_t float2halfbits(float value) {
   return hbits;
 }
 
-static float halfbits2float(uint16_t value) {
+SCALAR_FUN_ATTR float halfbits2float(uint16_t value) {
   uint32_t bits = mantissa_table[offset_table[value>>10]+(value&0x3FF)] + exponent_table[value>>10];
 
   union { uint32_t x; float y; } u;
@@ -235,7 +235,7 @@ static float halfbits2float(uint16_t value) {
   return u.y;
 }
 
-static uint16_t halfbitsnextafter(uint16_t from, uint16_t to) {
+SCALAR_FUN_ATTR uint16_t halfbitsnextafter(uint16_t from, uint16_t to) {
   int fabs = from & 0x7FFF, tabs = to & 0x7FFF;
   if(fabs > 0x7C00 || tabs > 0x7C00) {
     return ((from&0x7FFF)>0x7C00) ? (from|0x200) : (to|0x200);
diff --git a/rts/c/scalar.h b/rts/c/scalar.h
index 8098ca1eda..d01e7e6071 100644
--- a/rts/c/scalar.h
+++ b/rts/c/scalar.h
@@ -17,14 +17,6 @@
 // Double-precision definitions are only included if the preprocessor
 // macro FUTHARK_F64_ENABLED is set.
 
-#if defined(FUTHARK_CUDA)
-#define SCALAR_FUN_ATTR __device__ static inline
-#elif defined(OPENCL_DEVICE_CODE)
-#define SCALAR_FUN_ATTR static inline
-#else
-#define SCALAR_FUN_ATTR static inline
-#endif
-
 SCALAR_FUN_ATTR uint8_t add8(uint8_t x, uint8_t y) {
   return x + y;
 }
@@ -1215,19 +1207,19 @@ SCALAR_FUN_ATTR int64_t btoi_bool_i64(bool x) {
 #define zext_i64_i32(x) ((int32_t) (uint64_t) (x))
 #define zext_i64_i64(x) ((int64_t) (uint64_t) (x))
 
-static int8_t abs8(int8_t x) {
+SCALAR_FUN_ATTR int8_t abs8(int8_t x) {
   return (int8_t)abs(x);
 }
 
-static int16_t abs16(int16_t x) {
+SCALAR_FUN_ATTR int16_t abs16(int16_t x) {
   return (int16_t)abs(x);
 }
 
-static int32_t abs32(int32_t x) {
+SCALAR_FUN_ATTR int32_t abs32(int32_t x) {
   return abs(x);
 }
 
-static int64_t abs64(int64_t x) {
+SCALAR_FUN_ATTR int64_t abs64(int64_t x) {
 #if defined(__OPENCL_VERSION__) || defined(ISPC)
   return abs(x);
 #else
@@ -1236,60 +1228,60 @@ static int64_t abs64(int64_t x) {
 }
 
 #if defined(__OPENCL_VERSION__)
-static int32_t futrts_popc8(int8_t x) {
+SCALAR_FUN_ATTR int32_t futrts_popc8(int8_t x) {
   return popcount(x);
 }
 
-static int32_t futrts_popc16(int16_t x) {
+SCALAR_FUN_ATTR int32_t futrts_popc16(int16_t x) {
   return popcount(x);
 }
 
-static int32_t futrts_popc32(int32_t x) {
+SCALAR_FUN_ATTR int32_t futrts_popc32(int32_t x) {
   return popcount(x);
 }
 
-static int32_t futrts_popc64(int64_t x) {
+SCALAR_FUN_ATTR int32_t futrts_popc64(int64_t x) {
   return popcount(x);
 }
 #elif defined(__CUDA_ARCH__)
 
-static int32_t futrts_popc8(int8_t x) {
+SCALAR_FUN_ATTR int32_t futrts_popc8(int8_t x) {
   return __popc(zext_i8_i32(x));
 }
 
-static int32_t futrts_popc16(int16_t x) {
+SCALAR_FUN_ATTR int32_t futrts_popc16(int16_t x) {
   return __popc(zext_i16_i32(x));
 }
 
-static int32_t futrts_popc32(int32_t x) {
+SCALAR_FUN_ATTR int32_t futrts_popc32(int32_t x) {
   return __popc(x);
 }
 
-static int32_t futrts_popc64(int64_t x) {
+SCALAR_FUN_ATTR int32_t futrts_popc64(int64_t x) {
   return __popcll(x);
 }
 
 #else // Not OpenCL or CUDA, but plain C.
 
-static int32_t futrts_popc8(uint8_t x) {
+SCALAR_FUN_ATTR int32_t futrts_popc8(uint8_t x) {
   int c = 0;
   for (; x; ++c) { x &= x - 1; }
   return c;
 }
 
-static int32_t futrts_popc16(uint16_t x) {
+SCALAR_FUN_ATTR int32_t futrts_popc16(uint16_t x) {
   int c = 0;
   for (; x; ++c) { x &= x - 1; }
   return c;
 }
 
-static int32_t futrts_popc32(uint32_t x) {
+SCALAR_FUN_ATTR int32_t futrts_popc32(uint32_t x) {
   int c = 0;
   for (; x; ++c) { x &= x - 1; }
   return c;
 }
 
-static int32_t futrts_popc64(uint64_t x) {
+SCALAR_FUN_ATTR int32_t futrts_popc64(uint64_t x) {
   int c = 0;
   for (; x; ++c) { x &= x - 1; }
   return c;
@@ -1297,28 +1289,28 @@ static int32_t futrts_popc64(uint64_t x) {
 #endif
 
 #if defined(__OPENCL_VERSION__)
-static uint8_t  futrts_umul_hi8 ( uint8_t a,  uint8_t b) { return mul_hi(a, b); }
-static uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return mul_hi(a, b); }
-static uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return mul_hi(a, b); }
-static uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { return mul_hi(a, b); }
-static uint8_t  futrts_smul_hi8 ( int8_t a,  int8_t b) { return mul_hi(a, b); }
-static uint16_t futrts_smul_hi16(int16_t a, int16_t b) { return mul_hi(a, b); }
-static uint32_t futrts_smul_hi32(int32_t a, int32_t b) { return mul_hi(a, b); }
-static uint64_t futrts_smul_hi64(int64_t a, int64_t b) { return mul_hi(a, b); }
+SCALAR_FUN_ATTR uint8_t  futrts_umul_hi8 ( uint8_t a,  uint8_t b) { return mul_hi(a, b); }
+SCALAR_FUN_ATTR uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return mul_hi(a, b); }
+SCALAR_FUN_ATTR uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return mul_hi(a, b); }
+SCALAR_FUN_ATTR uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { return mul_hi(a, b); }
+SCALAR_FUN_ATTR uint8_t  futrts_smul_hi8 ( int8_t a,  int8_t b) { return mul_hi(a, b); }
+SCALAR_FUN_ATTR uint16_t futrts_smul_hi16(int16_t a, int16_t b) { return mul_hi(a, b); }
+SCALAR_FUN_ATTR uint32_t futrts_smul_hi32(int32_t a, int32_t b) { return mul_hi(a, b); }
+SCALAR_FUN_ATTR uint64_t futrts_smul_hi64(int64_t a, int64_t b) { return mul_hi(a, b); }
 #elif defined(__CUDA_ARCH__)
-static  uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; }
-static uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; }
-static uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return __umulhi(a, b); }
-static uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { return __umul64hi(a, b); }
-static  uint8_t futrts_smul_hi8 ( int8_t a, int8_t b) { return ((int16_t)a) * ((int16_t)b) >> 8; }
-static uint16_t futrts_smul_hi16(int16_t a, int16_t b) { return ((int32_t)a) * ((int32_t)b) >> 16; }
-static uint32_t futrts_smul_hi32(int32_t a, int32_t b) { return __mulhi(a, b); }
-static uint64_t futrts_smul_hi64(int64_t a, int64_t b) { return __mul64hi(a, b); }
+SCALAR_FUN_ATTR  uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; }
+SCALAR_FUN_ATTR uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; }
+SCALAR_FUN_ATTR uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return __umulhi(a, b); }
+SCALAR_FUN_ATTR uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { return __umul64hi(a, b); }
+SCALAR_FUN_ATTR  uint8_t futrts_smul_hi8 ( int8_t a, int8_t b) { return ((int16_t)a) * ((int16_t)b) >> 8; }
+SCALAR_FUN_ATTR uint16_t futrts_smul_hi16(int16_t a, int16_t b) { return ((int32_t)a) * ((int32_t)b) >> 16; }
+SCALAR_FUN_ATTR uint32_t futrts_smul_hi32(int32_t a, int32_t b) { return __mulhi(a, b); }
+SCALAR_FUN_ATTR uint64_t futrts_smul_hi64(int64_t a, int64_t b) { return __mul64hi(a, b); }
 #elif ISPC
-static uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; }
-static uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; }
-static uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return ((uint64_t)a) * ((uint64_t)b) >> 32; }
-static uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) {
+SCALAR_FUN_ATTR uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; }
+SCALAR_FUN_ATTR uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; }
+SCALAR_FUN_ATTR uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return ((uint64_t)a) * ((uint64_t)b) >> 32; }
+SCALAR_FUN_ATTR uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) {
   uint64_t ah = a >> 32;
   uint64_t al = a & 0xffffffff;
   uint64_t bh = b >> 32;
@@ -1341,10 +1333,10 @@ static uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) {
 
   return h;
 }
-static  int8_t futrts_smul_hi8 ( int8_t a,  int8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; }
-static int16_t futrts_smul_hi16(int16_t a, int16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; }
-static int32_t futrts_smul_hi32(int32_t a, int32_t b) { return ((uint64_t)a) * ((uint64_t)b) >> 32; }
-static int64_t futrts_smul_hi64(int64_t a, int64_t b) {
+SCALAR_FUN_ATTR  int8_t futrts_smul_hi8 ( int8_t a,  int8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; }
+SCALAR_FUN_ATTR int16_t futrts_smul_hi16(int16_t a, int16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; }
+SCALAR_FUN_ATTR int32_t futrts_smul_hi32(int32_t a, int32_t b) { return ((uint64_t)a) * ((uint64_t)b) >> 32; }
+SCALAR_FUN_ATTR int64_t futrts_smul_hi64(int64_t a, int64_t b) {
   uint64_t ah = a >> 32;
   uint64_t al = a & 0xffffffff;
   uint64_t bh = b >> 32;
@@ -1369,132 +1361,132 @@ static int64_t futrts_smul_hi64(int64_t a, int64_t b) {
 }
 
 #else // Not OpenCL, ISPC, or CUDA, but plain C.
-static uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; }
-static uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; }
-static uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return ((uint64_t)a) * ((uint64_t)b) >> 32; }
-static uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { return ((__uint128_t)a) * ((__uint128_t)b) >> 64; }
-static int8_t futrts_smul_hi8(int8_t a, int8_t b) { return ((int16_t)a) * ((int16_t)b) >> 8; }
-static int16_t futrts_smul_hi16(int16_t a, int16_t b) { return ((int32_t)a) * ((int32_t)b) >> 16; }
-static int32_t futrts_smul_hi32(int32_t a, int32_t b) { return ((int64_t)a) * ((int64_t)b) >> 32; }
-static int64_t futrts_smul_hi64(int64_t a, int64_t b) { return ((__int128_t)a) * ((__int128_t)b) >> 64; }
+SCALAR_FUN_ATTR uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; }
+SCALAR_FUN_ATTR uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; }
+SCALAR_FUN_ATTR uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return ((uint64_t)a) * ((uint64_t)b) >> 32; }
+SCALAR_FUN_ATTR uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { return ((__uint128_t)a) * ((__uint128_t)b) >> 64; }
+SCALAR_FUN_ATTR int8_t futrts_smul_hi8(int8_t a, int8_t b) { return ((int16_t)a) * ((int16_t)b) >> 8; }
+SCALAR_FUN_ATTR int16_t futrts_smul_hi16(int16_t a, int16_t b) { return ((int32_t)a) * ((int32_t)b) >> 16; }
+SCALAR_FUN_ATTR int32_t futrts_smul_hi32(int32_t a, int32_t b) { return ((int64_t)a) * ((int64_t)b) >> 32; }
+SCALAR_FUN_ATTR int64_t futrts_smul_hi64(int64_t a, int64_t b) { return ((__int128_t)a) * ((__int128_t)b) >> 64; }
 #endif
 
 #if defined(__OPENCL_VERSION__)
-static  uint8_t futrts_umad_hi8 ( uint8_t a,  uint8_t b,  uint8_t c) { return mad_hi(a, b, c); }
-static uint16_t futrts_umad_hi16(uint16_t a, uint16_t b, uint16_t c) { return mad_hi(a, b, c); }
-static uint32_t futrts_umad_hi32(uint32_t a, uint32_t b, uint32_t c) { return mad_hi(a, b, c); }
-static uint64_t futrts_umad_hi64(uint64_t a, uint64_t b, uint64_t c) { return mad_hi(a, b, c); }
-static  uint8_t futrts_smad_hi8( int8_t a,  int8_t b,   int8_t c) { return mad_hi(a, b, c); }
-static uint16_t futrts_smad_hi16(int16_t a, int16_t b, int16_t c) { return mad_hi(a, b, c); }
-static uint32_t futrts_smad_hi32(int32_t a, int32_t b, int32_t c) { return mad_hi(a, b, c); }
-static uint64_t futrts_smad_hi64(int64_t a, int64_t b, int64_t c) { return mad_hi(a, b, c); }
+SCALAR_FUN_ATTR  uint8_t futrts_umad_hi8 ( uint8_t a,  uint8_t b,  uint8_t c) { return mad_hi(a, b, c); }
+SCALAR_FUN_ATTR uint16_t futrts_umad_hi16(uint16_t a, uint16_t b, uint16_t c) { return mad_hi(a, b, c); }
+SCALAR_FUN_ATTR uint32_t futrts_umad_hi32(uint32_t a, uint32_t b, uint32_t c) { return mad_hi(a, b, c); }
+SCALAR_FUN_ATTR uint64_t futrts_umad_hi64(uint64_t a, uint64_t b, uint64_t c) { return mad_hi(a, b, c); }
+SCALAR_FUN_ATTR  uint8_t futrts_smad_hi8( int8_t a,  int8_t b,   int8_t c) { return mad_hi(a, b, c); }
+SCALAR_FUN_ATTR uint16_t futrts_smad_hi16(int16_t a, int16_t b, int16_t c) { return mad_hi(a, b, c); }
+SCALAR_FUN_ATTR uint32_t futrts_smad_hi32(int32_t a, int32_t b, int32_t c) { return mad_hi(a, b, c); }
+SCALAR_FUN_ATTR uint64_t futrts_smad_hi64(int64_t a, int64_t b, int64_t c) { return mad_hi(a, b, c); }
 #else // Not OpenCL
 
-static  uint8_t futrts_umad_hi8( uint8_t a,  uint8_t b,  uint8_t c) { return futrts_umul_hi8(a, b) + c; }
-static uint16_t futrts_umad_hi16(uint16_t a, uint16_t b, uint16_t c) { return futrts_umul_hi16(a, b) + c; }
-static uint32_t futrts_umad_hi32(uint32_t a, uint32_t b, uint32_t c) { return futrts_umul_hi32(a, b) + c; }
-static uint64_t futrts_umad_hi64(uint64_t a, uint64_t b, uint64_t c) { return futrts_umul_hi64(a, b) + c; }
-static  uint8_t futrts_smad_hi8 ( int8_t a,  int8_t b,  int8_t c) { return futrts_smul_hi8(a, b) + c; }
-static uint16_t futrts_smad_hi16(int16_t a, int16_t b, int16_t c) { return futrts_smul_hi16(a, b) + c; }
-static uint32_t futrts_smad_hi32(int32_t a, int32_t b, int32_t c) { return futrts_smul_hi32(a, b) + c; }
-static uint64_t futrts_smad_hi64(int64_t a, int64_t b, int64_t c) { return futrts_smul_hi64(a, b) + c; }
+SCALAR_FUN_ATTR  uint8_t futrts_umad_hi8( uint8_t a,  uint8_t b,  uint8_t c) { return futrts_umul_hi8(a, b) + c; }
+SCALAR_FUN_ATTR uint16_t futrts_umad_hi16(uint16_t a, uint16_t b, uint16_t c) { return futrts_umul_hi16(a, b) + c; }
+SCALAR_FUN_ATTR uint32_t futrts_umad_hi32(uint32_t a, uint32_t b, uint32_t c) { return futrts_umul_hi32(a, b) + c; }
+SCALAR_FUN_ATTR uint64_t futrts_umad_hi64(uint64_t a, uint64_t b, uint64_t c) { return futrts_umul_hi64(a, b) + c; }
+SCALAR_FUN_ATTR  uint8_t futrts_smad_hi8 ( int8_t a,  int8_t b,  int8_t c) { return futrts_smul_hi8(a, b) + c; }
+SCALAR_FUN_ATTR uint16_t futrts_smad_hi16(int16_t a, int16_t b, int16_t c) { return futrts_smul_hi16(a, b) + c; }
+SCALAR_FUN_ATTR uint32_t futrts_smad_hi32(int32_t a, int32_t b, int32_t c) { return futrts_smul_hi32(a, b) + c; }
+SCALAR_FUN_ATTR uint64_t futrts_smad_hi64(int64_t a, int64_t b, int64_t c) { return futrts_smul_hi64(a, b) + c; }
 #endif
 
 #if defined(__OPENCL_VERSION__)
-static int32_t futrts_clzz8(int8_t x) {
+SCALAR_FUN_ATTR int32_t futrts_clzz8(int8_t x) {
   return clz(x);
 }
 
-static int32_t futrts_clzz16(int16_t x) {
+SCALAR_FUN_ATTR int32_t futrts_clzz16(int16_t x) {
   return clz(x);
 }
 
-static int32_t futrts_clzz32(int32_t x) {
+SCALAR_FUN_ATTR int32_t futrts_clzz32(int32_t x) {
   return clz(x);
 }
 
-static int32_t futrts_clzz64(int64_t x) {
+SCALAR_FUN_ATTR int32_t futrts_clzz64(int64_t x) {
   return clz(x);
 }
 
 #elif defined(__CUDA_ARCH__)
 
-static int32_t futrts_clzz8(int8_t x) {
+SCALAR_FUN_ATTR int32_t futrts_clzz8(int8_t x) {
   return __clz(zext_i8_i32(x)) - 24;
 }
 
-static int32_t futrts_clzz16(int16_t x) {
+SCALAR_FUN_ATTR int32_t futrts_clzz16(int16_t x) {
   return __clz(zext_i16_i32(x)) - 16;
 }
 
-static int32_t futrts_clzz32(int32_t x) {
+SCALAR_FUN_ATTR int32_t futrts_clzz32(int32_t x) {
   return __clz(x);
 }
 
-static int32_t futrts_clzz64(int64_t x) {
+SCALAR_FUN_ATTR int32_t futrts_clzz64(int64_t x) {
   return __clzll(x);
 }
 
 #elif ISPC
 
-static int32_t futrts_clzz8(int8_t x) {
+SCALAR_FUN_ATTR int32_t futrts_clzz8(int8_t x) {
   return count_leading_zeros((int32_t)(uint8_t)x)-24;
 }
 
-static int32_t futrts_clzz16(int16_t x) {
+SCALAR_FUN_ATTR int32_t futrts_clzz16(int16_t x) {
   return count_leading_zeros((int32_t)(uint16_t)x)-16;
 }
 
-static int32_t futrts_clzz32(int32_t x) {
+SCALAR_FUN_ATTR int32_t futrts_clzz32(int32_t x) {
   return count_leading_zeros(x);
 }
 
-static int32_t futrts_clzz64(int64_t x) {
+SCALAR_FUN_ATTR int32_t futrts_clzz64(int64_t x) {
   return count_leading_zeros(x);
 }
 
 #else // Not OpenCL, ISPC or CUDA, but plain C.
 
-static int32_t futrts_clzz8(int8_t x) {
+SCALAR_FUN_ATTR int32_t futrts_clzz8(int8_t x) {
   return x == 0 ? 8 : __builtin_clz((uint32_t)zext_i8_i32(x)) - 24;
 }
 
-static int32_t futrts_clzz16(int16_t x) {
+SCALAR_FUN_ATTR int32_t futrts_clzz16(int16_t x) {
   return x == 0 ? 16 : __builtin_clz((uint32_t)zext_i16_i32(x)) - 16;
 }
 
-static int32_t futrts_clzz32(int32_t x) {
+SCALAR_FUN_ATTR int32_t futrts_clzz32(int32_t x) {
   return x == 0 ? 32 : __builtin_clz((uint32_t)x);
 }
 
-static int32_t futrts_clzz64(int64_t x) {
+SCALAR_FUN_ATTR int32_t futrts_clzz64(int64_t x) {
   return x == 0 ? 64 : __builtin_clzll((uint64_t)x);
 }
 #endif
 
 #if defined(__OPENCL_VERSION__)
-static int32_t futrts_ctzz8(int8_t x) {
+SCALAR_FUN_ATTR int32_t futrts_ctzz8(int8_t x) {
   int i = 0;
   for (; i < 8 && (x & 1) == 0; i++, x >>= 1)
     ;
   return i;
 }
 
-static int32_t futrts_ctzz16(int16_t x) {
+SCALAR_FUN_ATTR int32_t futrts_ctzz16(int16_t x) {
   int i = 0;
   for (; i < 16 && (x & 1) == 0; i++, x >>= 1)
     ;
   return i;
 }
 
-static int32_t futrts_ctzz32(int32_t x) {
+SCALAR_FUN_ATTR int32_t futrts_ctzz32(int32_t x) {
   int i = 0;
   for (; i < 32 && (x & 1) == 0; i++, x >>= 1)
     ;
   return i;
 }
 
-static int32_t futrts_ctzz64(int64_t x) {
+SCALAR_FUN_ATTR int32_t futrts_ctzz64(int64_t x) {
   int i = 0;
   for (; i < 64 && (x & 1) == 0; i++, x >>= 1)
     ;
@@ -1503,59 +1495,59 @@ static int32_t futrts_ctzz64(int64_t x) {
 
 #elif defined(__CUDA_ARCH__)
 
-static int32_t futrts_ctzz8(int8_t x) {
+SCALAR_FUN_ATTR int32_t futrts_ctzz8(int8_t x) {
   int y = __ffs(x);
   return y == 0 ? 8 : y - 1;
 }
 
-static int32_t futrts_ctzz16(int16_t x) {
+SCALAR_FUN_ATTR int32_t futrts_ctzz16(int16_t x) {
   int y = __ffs(x);
   return y == 0 ? 16 : y - 1;
 }
 
-static int32_t futrts_ctzz32(int32_t x) {
+SCALAR_FUN_ATTR int32_t futrts_ctzz32(int32_t x) {
   int y = __ffs(x);
   return y == 0 ? 32 : y - 1;
 }
 
-static int32_t futrts_ctzz64(int64_t x) {
+SCALAR_FUN_ATTR int32_t futrts_ctzz64(int64_t x) {
   int y = __ffsll(x);
   return y == 0 ? 64 : y - 1;
 }
 
 #elif ISPC
 
-static int32_t futrts_ctzz8(int8_t x) {
+SCALAR_FUN_ATTR int32_t futrts_ctzz8(int8_t x) {
   return x == 0 ? 8 : count_trailing_zeros((int32_t)x);
 }
 
-static int32_t futrts_ctzz16(int16_t x) {
+SCALAR_FUN_ATTR int32_t futrts_ctzz16(int16_t x) {
   return x == 0 ? 16 : count_trailing_zeros((int32_t)x);
 }
 
-static int32_t futrts_ctzz32(int32_t x) {
+SCALAR_FUN_ATTR int32_t futrts_ctzz32(int32_t x) {
   return count_trailing_zeros(x);
 }
 
-static int32_t futrts_ctzz64(int64_t x) {
+SCALAR_FUN_ATTR int32_t futrts_ctzz64(int64_t x) {
   return count_trailing_zeros(x);
 }
 
 #else // Not OpenCL or CUDA, but plain C.
 
-static int32_t futrts_ctzz8(int8_t x) {
+SCALAR_FUN_ATTR int32_t futrts_ctzz8(int8_t x) {
   return x == 0 ? 8 : __builtin_ctz((uint32_t)x);
 }
 
-static int32_t futrts_ctzz16(int16_t x) {
+SCALAR_FUN_ATTR int32_t futrts_ctzz16(int16_t x) {
   return x == 0 ? 16 : __builtin_ctz((uint32_t)x);
 }
 
-static int32_t futrts_ctzz32(int32_t x) {
+SCALAR_FUN_ATTR int32_t futrts_ctzz32(int32_t x) {
   return x == 0 ? 32 : __builtin_ctz((uint32_t)x);
 }
 
-static int32_t futrts_ctzz64(int64_t x) {
+SCALAR_FUN_ATTR int32_t futrts_ctzz64(int64_t x) {
   return x == 0 ? 64 : __builtin_ctzll((uint64_t)x);
 }
 #endif
diff --git a/rts/cuda/prelude.cu b/rts/cuda/prelude.cu
index 71cdfc9f01..e4700b1422 100644
--- a/rts/cuda/prelude.cu
+++ b/rts/cuda/prelude.cu
@@ -1,5 +1,6 @@
 // start of prelude.cu
 
+#define SCALAR_FUN_ATTR __device__ static inline
 #define FUTHARK_F64_ENABLED
 
 typedef char int8_t;
diff --git a/rts/opencl/prelude.cl b/rts/opencl/prelude.cl
index 9ee88a2840..6284dc5746 100644
--- a/rts/opencl/prelude.cl
+++ b/rts/opencl/prelude.cl
@@ -1,5 +1,7 @@
 // Start of prelude.cl
 
+#define SCALAR_FUN_ATTR static inline
+
 typedef char int8_t;
 typedef short int16_t;
 typedef int int32_t;
diff --git a/src/Futhark/CodeGen/Backends/GenericC.hs b/src/Futhark/CodeGen/Backends/GenericC.hs
index 3f7cf49d10..93fe442a0f 100644
--- a/src/Futhark/CodeGen/Backends/GenericC.hs
+++ b/src/Futhark/CodeGen/Backends/GenericC.hs
@@ -428,6 +428,7 @@ $errorsH
 #undef NDEBUG
 #include <assert.h>
 #include <stdarg.h>
+#define SCALAR_FUN_ATTR static inline
 $utilH
 $cacheH
 $halfH